Diffstat (limited to 'kernel')
121 files changed, 5528 insertions, 4412 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
	    rcupdate.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
	    notifier.o ksysfs.o cred.o \
	    async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
 obj-y += sched/
 obj-y += power/
 
-ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
-obj-$(CONFIG_X86) += kcmp.o
-endif
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 
 $(obj)/time.o: $(obj)/timeconst.h
 
-quiet_cmd_timeconst = TIMEC $@
-      cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+	$(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
 targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
-	$(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+	$(call if_changed,bc)
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 #
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
 # fail and that the kernel may be used afterwards.
 #
 ###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
 $(error Could not determine digest type to use from kernel config)
 endif
 
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
	@echo "### needs to be run as root, and uses a hardware random"
	@echo "### number generator if one is available."
	@echo "###"
-	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
-		-x509 -config x509.genkey \
+	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+		-batch -x509 -config x509.genkey \
		-outform DER -out signing_key.x509 \
		-keyout signing_key.priv
	@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
	if (IS_ERR(file))
		return PTR_ERR(file);
 
-	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+	if (!S_ISREG(file_inode(file)->i_mode)) {
		filp_close(file, NULL);
		return -EACCES;
	}
@@ -566,6 +566,7 @@ out:
 void acct_collect(long exitcode, int group_dead)
 {
	struct pacct_struct *pacct = &current->signal->pacct;
+	cputime_t utime, stime;
	unsigned long vsize = 0;
 
	if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
		pacct->ac_flag |= ACORE;
	if (current->flags & PF_SIGNALED)
		pacct->ac_flag |= AXSIG;
-	pacct->ac_utime += current->utime;
-	pacct->ac_stime += current->stime;
+	task_cputime(current, &utime, &stime);
+	pacct->ac_utime += utime;
+	pacct->ac_stime += stime;
	pacct->ac_minflt += current->min_flt;
	pacct->ac_majflt += current->maj_flt;
	spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 9d3118384858..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 
+#include "workqueue_internal.h"
+
 static async_cookie_t next_cookie = 1;
 
 #define MAX_WORK	32768
+#define ASYNC_COOKIE_MAX	ULLONG_MAX	/* infinity cookie */
 
-static LIST_HEAD(async_pending);
-static ASYNC_DOMAIN(async_running);
-static LIST_HEAD(async_domains);
+static LIST_HEAD(async_global_pending);	/* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
 static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
 
 struct async_entry {
-	struct list_head	list;
+	struct list_head	domain_list;
+	struct list_head	global_list;
	struct work_struct	work;
	async_cookie_t		cookie;
	async_func_ptr		*func;
	void			*data;
-	struct async_domain	*running;
+	struct async_domain	*domain;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
 
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct async_domain *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-	struct async_entry *entry;
-
-	if (!list_empty(&running->domain)) {
-		entry = list_first_entry(&running->domain, typeof(*entry), list);
-		return entry->cookie;
-	}
+	struct async_entry *first = NULL;
+	async_cookie_t ret = ASYNC_COOKIE_MAX;
+	unsigned long flags;
 
-	list_for_each_entry(entry, &async_pending, list)
-		if (entry->running == running)
-			return entry->cookie;
+	spin_lock_irqsave(&async_lock, flags);
 
-	return next_cookie;	/* "infinity" value */
-}
+	if (domain) {
+		if (!list_empty(&domain->pending))
+			first = list_first_entry(&domain->pending,
+					struct async_entry, domain_list);
+	} else {
+		if (!list_empty(&async_global_pending))
+			first = list_first_entry(&async_global_pending,
+					struct async_entry, global_list);
+	}
 
-static async_cookie_t lowest_in_progress(struct async_domain *running)
-{
-	unsigned long flags;
-	async_cookie_t ret;
+	if (first)
+		ret = first->cookie;
 
-	spin_lock_irqsave(&async_lock, flags);
-	ret = __lowest_in_progress(running);
	spin_unlock_irqrestore(&async_lock, flags);
	return ret;
 }
@@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)
		container_of(work, struct async_entry, work);
	unsigned long flags;
	ktime_t uninitialized_var(calltime), delta, rettime;
-	struct async_domain *running = entry->running;
 
-	/* 1) move self to the running queue */
-	spin_lock_irqsave(&async_lock, flags);
-	list_move_tail(&entry->list, &running->domain);
-	spin_unlock_irqrestore(&async_lock, flags);
-
-	/* 2) run (and print duration) */
+	/* 1) run (and print duration) */
	if (initcall_debug && system_state == SYSTEM_BOOTING) {
		printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
			(long long)entry->cookie,
@@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
			(long long)ktime_to_ns(delta) >> 10);
	}
 
-	/* 3) remove self from the running queue */
+	/* 2) remove self from the pending queues */
	spin_lock_irqsave(&async_lock, flags);
-	list_del(&entry->list);
-	if (running->registered && --running->count == 0)
-		list_del_init(&running->node);
+	list_del_init(&entry->domain_list);
+	list_del_init(&entry->global_list);
 
-	/* 4) free the entry */
+	/* 3) free the entry */
	kfree(entry);
	atomic_dec(&entry_count);
 
	spin_unlock_irqrestore(&async_lock, flags);
 
-	/* 5) wake up any waiters */
+	/* 4) wake up any waiters */
	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
 {
	struct async_entry *entry;
	unsigned long flags;
@@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
		ptr(data, newcookie);
		return newcookie;
	}
+	INIT_LIST_HEAD(&entry->domain_list);
+	INIT_LIST_HEAD(&entry->global_list);
	INIT_WORK(&entry->work, async_run_entry_fn);
	entry->func = ptr;
	entry->data = data;
-	entry->running = running;
+	entry->domain = domain;
 
	spin_lock_irqsave(&async_lock, flags);
+
+	/* allocate cookie and queue */
	newcookie = entry->cookie = next_cookie++;
-	list_add_tail(&entry->list, &async_pending);
-	if (running->registered && running->count++ == 0)
-		list_add_tail(&running->node, &async_domains);
+
+	list_add_tail(&entry->domain_list, &domain->pending);
+	if (domain->registered)
+		list_add_tail(&entry->global_list, &async_global_pending);
+
	atomic_inc(&entry_count);
	spin_unlock_irqrestore(&async_lock, flags);
 
+	/* mark that this task has queued an async job, used by module init */
+	current->flags |= PF_USED_ASYNC;
+
	/* schedule for execution */
	queue_work(system_unbound_wq, &entry->work);
 
@@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
  */
 async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 {
-	return __async_schedule(ptr, data, &async_running);
+	return __async_schedule(ptr, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
@@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
  * @ptr: function to execute asynchronously
  * @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
  */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-				     struct async_domain *running)
+				     struct async_domain *domain)
 {
-	return __async_schedule(ptr, data, running);
+	return __async_schedule(ptr, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
@@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
  */
 void async_synchronize_full(void)
 {
-	mutex_lock(&async_register_mutex);
-	do {
-		struct async_domain *domain = NULL;
-
-		spin_lock_irq(&async_lock);
-		if (!list_empty(&async_domains))
-			domain = list_first_entry(&async_domains, typeof(*domain), node);
-		spin_unlock_irq(&async_lock);
-
-		async_synchronize_cookie_domain(next_cookie, domain);
-	} while (!list_empty(&async_domains));
-	mutex_unlock(&async_register_mutex);
+	async_synchronize_full_domain(NULL);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
@@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
  */
 void async_unregister_domain(struct async_domain *domain)
 {
-	mutex_lock(&async_register_mutex);
	spin_lock_irq(&async_lock);
-	WARN_ON(!domain->registered || !list_empty(&domain->node) ||
-		!list_empty(&domain->domain));
+	WARN_ON(!domain->registered || !list_empty(&domain->pending));
	domain->registered = 0;
	spin_unlock_irq(&async_lock);
-	mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_unregister_domain);
 
 /**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
  */
 void async_synchronize_full_domain(struct async_domain *domain)
 {
-	async_synchronize_cookie_domain(next_cookie, domain);
+	async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
 /**
  * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
  */
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
	ktime_t uninitialized_var(starttime), delta, endtime;
 
-	if (!running)
-		return;
-
	if (initcall_debug && system_state == SYSTEM_BOOTING) {
		printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
		starttime = ktime_get();
	}
 
-	wait_event(async_done, lowest_in_progress(running) >= cookie);
+	wait_event(async_done, lowest_in_progress(domain) >= cookie);
 
	if (initcall_debug && system_state == SYSTEM_BOOTING) {
		endtime = ktime_get();
@@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
  */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-	async_synchronize_cookie_domain(cookie, &async_running);
+	async_synchronize_cookie_domain(cookie, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+	struct worker *worker = current_wq_worker();
+
+	return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
	int rc = 0;
 
	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+	if (unlikely(!ab))
+		return rc;
	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
			 old, from_kuid(&init_user_ns, loginuid), sessionid);
	if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
	}
 
	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+	if (unlikely(!*ab))
+		return rc;
	audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
			 task_tgid_vnr(current),
			 from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
	}
 }
 
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static void wait_for_auditd(unsigned long sleep_time)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&audit_backlog_wait, &wait);
+
+	if (audit_backlog_limit &&
+	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+		schedule_timeout(sleep_time);
+
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&audit_backlog_wait, &wait);
+}
+
 /* Obtain an audit buffer. This routine does locking to obtain the
  * audit buffer, but then no locking is required for calls to
  * audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 
	while (audit_backlog_limit
	       && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
-		if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
-		    && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+		if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+			unsigned long sleep_time;
 
-			/* Wait for auditd to drain the queue a little */
-			DECLARE_WAITQUEUE(wait, current);
-			set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue(&audit_backlog_wait, &wait);
-
-			if (audit_backlog_limit &&
-			    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
-				schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
-			__set_current_state(TASK_RUNNING);
-			remove_wait_queue(&audit_backlog_wait, &wait);
+			sleep_time = timeout_start + audit_backlog_wait_time -
+					jiffies;
+			if ((long)sleep_time > 0)
+				wait_for_auditd(sleep_time);
			continue;
		}
		if (audit_rate_check() && printk_ratelimit())
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e81175ef25f8..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -449,11 +449,26 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
	return 0;
 }
 
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+	if (unlikely(!ab))
+		return;
+	audit_log_format(ab, "op=");
+	audit_log_string(ab, "remove rule");
+	audit_log_format(ab, " dir=");
+	audit_log_untrustedstring(ab, rule->tree->pathname);
+	audit_log_key(ab, rule->filterkey);
+	audit_log_format(ab, " list=%d res=1", rule->listnr);
+	audit_log_end(ab);
+}
+
 static void kill_rules(struct audit_tree *tree)
 {
	struct audit_krule *rule, *next;
	struct audit_entry *entry;
-	struct audit_buffer *ab;
 
	list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
		entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
		list_del_init(&rule->rlist);
		if (rule->tree) {
			/* not a half-baked one */
-			ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "op=");
-			audit_log_string(ab, "remove rule");
-			audit_log_format(ab, " dir=");
-			audit_log_untrustedstring(ab, rule->tree->pathname);
-			audit_log_key(ab, rule->filterkey);
-			audit_log_format(ab, " list=%d res=1", rule->listnr);
-			audit_log_end(ab);
+			audit_log_remove_rule(rule);
			rule->tree = NULL;
			list_del_rcu(&entry->list);
			list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4a599f699adc..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
	if (audit_enabled) {
		struct audit_buffer *ab;
		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+		if (unlikely(!ab))
+			return;
		audit_log_format(ab, "auid=%u ses=%u op=",
				 from_kuid(&init_user_ns, audit_get_loginuid(current)),
				 audit_get_sessionid(current));
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
  * audit_receive_filter - apply all rules to the specified message type
  * @type: audit message type
  * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e37e6a12c5e3..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
			audit_log_end(ab);
			ab = audit_log_start(context, GFP_KERNEL,
					     AUDIT_IPC_SET_PERM);
+			if (unlikely(!ab))
+				return;
			audit_log_format(ab,
				"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
				context->ipc.qbytes,
				context->ipc.perm_uid,
				context->ipc.perm_gid,
				context->ipc.perm_mode);
-			if (!ab)
-				return;
		}
		break; }
	case AUDIT_MQ_OPEN: {
@@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
	context->type = AUDIT_MMAP;
 }
 
-static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+static void audit_log_task(struct audit_buffer *ab)
 {
	kuid_t auid, uid;
	kgid_t gid;
@@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
	audit_log_task_context(ab);
	audit_log_format(ab, " pid=%d comm=", current->pid);
	audit_log_untrustedstring(ab, current->comm);
+}
+
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+	audit_log_task(ab);
	audit_log_format(ab, " reason=");
	audit_log_string(ab, reason);
	audit_log_format(ab, " sig=%ld", signr);
@@ -2715,6 +2720,8 @@ void audit_core_dumps(long signr)
		return;
 
	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+	if (unlikely(!ab))
+		return;
	audit_log_abend(ab, "memory violation", signr);
	audit_log_end(ab);
 }
@@ -2723,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
	struct audit_buffer *ab;
 
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	audit_log_abend(ab, "seccomp", signr);
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+	if (unlikely(!ab))
+		return;
+	audit_log_task(ab);
+	audit_log_format(ab, " sig=%ld", signr);
	audit_log_format(ab, " syscall=%ld", syscall);
	audit_log_format(ab, " compat=%d", is_compat_task());
	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798fd..a32f9432666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -52,7 +52,7 @@ | |||
| 52 | #include <linux/module.h> | 52 | #include <linux/module.h> |
| 53 | #include <linux/delayacct.h> | 53 | #include <linux/delayacct.h> |
| 54 | #include <linux/cgroupstats.h> | 54 | #include <linux/cgroupstats.h> |
| 55 | #include <linux/hash.h> | 55 | #include <linux/hashtable.h> |
| 56 | #include <linux/namei.h> | 56 | #include <linux/namei.h> |
| 57 | #include <linux/pid_namespace.h> | 57 | #include <linux/pid_namespace.h> |
| 58 | #include <linux/idr.h> | 58 | #include <linux/idr.h> |
| @@ -376,22 +376,18 @@ static int css_set_count; | |||
| 376 | * account cgroups in empty hierarchies. | 376 | * account cgroups in empty hierarchies. |
| 377 | */ | 377 | */ |
| 378 | #define CSS_SET_HASH_BITS 7 | 378 | #define CSS_SET_HASH_BITS 7 |
| 379 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) | 379 | static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); |
| 380 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; | ||
| 381 | 380 | ||
| 382 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | 381 | static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) |
| 383 | { | 382 | { |
| 384 | int i; | 383 | int i; |
| 385 | int index; | 384 | unsigned long key = 0UL; |
| 386 | unsigned long tmp = 0UL; | ||
| 387 | 385 | ||
| 388 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) | 386 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) |
| 389 | tmp += (unsigned long)css[i]; | 387 | key += (unsigned long)css[i]; |
| 390 | tmp = (tmp >> 16) ^ tmp; | 388 | key = (key >> 16) ^ key; |
| 391 | 389 | ||
| 392 | index = hash_long(tmp, CSS_SET_HASH_BITS); | 390 | return key; |
| 393 | |||
| 394 | return &css_set_table[index]; | ||
| 395 | } | 391 | } |
| 396 | 392 | ||
| 397 | /* We don't maintain the lists running through each css_set to its | 393 | /* We don't maintain the lists running through each css_set to its |
| @@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
| 418 | } | 414 | } |
| 419 | 415 | ||
| 420 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 416 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
| 421 | hlist_del(&cg->hlist); | 417 | hash_del(&cg->hlist); |
| 422 | css_set_count--; | 418 | css_set_count--; |
| 423 | 419 | ||
| 424 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | 420 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, |
| @@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
| 426 | struct cgroup *cgrp = link->cgrp; | 422 | struct cgroup *cgrp = link->cgrp; |
| 427 | list_del(&link->cg_link_list); | 423 | list_del(&link->cg_link_list); |
| 428 | list_del(&link->cgrp_link_list); | 424 | list_del(&link->cgrp_link_list); |
| 425 | |||
| 426 | /* | ||
| 427 | * We may not be holding cgroup_mutex, and if cgrp->count is | ||
| 428 | * dropped to 0 the cgroup can be destroyed at any time, hence | ||
| 429 | * rcu_read_lock is used to keep it alive. | ||
| 430 | */ | ||
| 431 | rcu_read_lock(); | ||
| 429 | if (atomic_dec_and_test(&cgrp->count) && | 432 | if (atomic_dec_and_test(&cgrp->count) && |
| 430 | notify_on_release(cgrp)) { | 433 | notify_on_release(cgrp)) { |
| 431 | if (taskexit) | 434 | if (taskexit) |
| 432 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 435 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
| 433 | check_for_release(cgrp); | 436 | check_for_release(cgrp); |
| 434 | } | 437 | } |
| 438 | rcu_read_unlock(); | ||
| 435 | 439 | ||
| 436 | kfree(link); | 440 | kfree(link); |
| 437 | } | 441 | } |
| @@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set( | |||
| 550 | { | 554 | { |
| 551 | int i; | 555 | int i; |
| 552 | struct cgroupfs_root *root = cgrp->root; | 556 | struct cgroupfs_root *root = cgrp->root; |
| 553 | struct hlist_head *hhead; | ||
| 554 | struct hlist_node *node; | ||
| 555 | struct css_set *cg; | 557 | struct css_set *cg; |
| 558 | unsigned long key; | ||
| 556 | 559 | ||
| 557 | /* | 560 | /* |
| 558 | * Build the set of subsystem state objects that we want to see in the | 561 | * Build the set of subsystem state objects that we want to see in the |
| @@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set( | |||
| 572 | } | 575 | } |
| 573 | } | 576 | } |
| 574 | 577 | ||
| 575 | hhead = css_set_hash(template); | 578 | key = css_set_hash(template); |
| 576 | hlist_for_each_entry(cg, node, hhead, hlist) { | 579 | hash_for_each_possible(css_set_table, cg, hlist, key) { |
| 577 | if (!compare_css_sets(cg, oldcg, cgrp, template)) | 580 | if (!compare_css_sets(cg, oldcg, cgrp, template)) |
| 578 | continue; | 581 | continue; |
| 579 | 582 | ||
| @@ -657,8 +660,8 @@ static struct css_set *find_css_set( | |||
| 657 | 660 | ||
| 658 | struct list_head tmp_cg_links; | 661 | struct list_head tmp_cg_links; |
| 659 | 662 | ||
| 660 | struct hlist_head *hhead; | ||
| 661 | struct cg_cgroup_link *link; | 663 | struct cg_cgroup_link *link; |
| 664 | unsigned long key; | ||
| 662 | 665 | ||
| 663 | /* First see if we already have a cgroup group that matches | 666 | /* First see if we already have a cgroup group that matches |
| 664 | * the desired set */ | 667 | * the desired set */ |
| @@ -704,8 +707,8 @@ static struct css_set *find_css_set( | |||
| 704 | css_set_count++; | 707 | css_set_count++; |
| 705 | 708 | ||
| 706 | /* Add this cgroup group to the hash table */ | 709 | /* Add this cgroup group to the hash table */ |
| 707 | hhead = css_set_hash(res->subsys); | 710 | key = css_set_hash(res->subsys); |
| 708 | hlist_add_head(&res->hlist, hhead); | 711 | hash_add(css_set_table, &res->hlist, key); |
| 709 | 712 | ||
| 710 | write_unlock(&css_set_lock); | 713 | write_unlock(&css_set_lock); |
| 711 | 714 | ||
| @@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
| 856 | return inode; | 859 | return inode; |
| 857 | } | 860 | } |
| 858 | 861 | ||
| 859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 862 | static void cgroup_free_fn(struct work_struct *work) |
| 860 | { | 863 | { |
| 861 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 864 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); |
| 862 | if (S_ISDIR(inode->i_mode)) { | 865 | struct cgroup_subsys *ss; |
| 863 | struct cgroup *cgrp = dentry->d_fsdata; | ||
| 864 | struct cgroup_subsys *ss; | ||
| 865 | BUG_ON(!(cgroup_is_removed(cgrp))); | ||
| 866 | /* It's possible for external users to be holding css | ||
| 867 | * reference counts on a cgroup; css_put() needs to | ||
| 868 | * be able to access the cgroup after decrementing | ||
| 869 | * the reference count in order to know if it needs to | ||
| 870 | * queue the cgroup to be handled by the release | ||
| 871 | * agent */ | ||
| 872 | synchronize_rcu(); | ||
| 873 | 866 | ||
| 874 | mutex_lock(&cgroup_mutex); | 867 | mutex_lock(&cgroup_mutex); |
| 875 | /* | 868 | /* |
| 876 | * Release the subsystem state objects. | 869 | * Release the subsystem state objects. |
| 877 | */ | 870 | */ |
| 878 | for_each_subsys(cgrp->root, ss) | 871 | for_each_subsys(cgrp->root, ss) |
| 879 | ss->css_free(cgrp); | 872 | ss->css_free(cgrp); |
| 880 | 873 | ||
| 881 | cgrp->root->number_of_cgroups--; | 874 | cgrp->root->number_of_cgroups--; |
| 882 | mutex_unlock(&cgroup_mutex); | 875 | mutex_unlock(&cgroup_mutex); |
| 883 | 876 | ||
| 884 | /* | 877 | /* |
| 885 | * Drop the active superblock reference that we took when we | 878 | * Drop the active superblock reference that we took when we |
| 886 | * created the cgroup | 879 | * created the cgroup |
| 887 | */ | 880 | */ |
| 888 | deactivate_super(cgrp->root->sb); | 881 | deactivate_super(cgrp->root->sb); |
| 889 | 882 | ||
| 890 | /* | 883 | /* |
| 891 | * if we're getting rid of the cgroup, refcount should ensure | 884 | * if we're getting rid of the cgroup, refcount should ensure |
| 892 | * that there are no pidlists left. | 885 | * that there are no pidlists left. |
| 893 | */ | 886 | */ |
| 894 | BUG_ON(!list_empty(&cgrp->pidlists)); | 887 | BUG_ON(!list_empty(&cgrp->pidlists)); |
| 888 | |||
| 889 | simple_xattrs_free(&cgrp->xattrs); | ||
| 890 | |||
| 891 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
| 892 | kfree(cgrp); | ||
| 893 | } | ||
| 895 | 894 | ||
| 896 | simple_xattrs_free(&cgrp->xattrs); | 895 | static void cgroup_free_rcu(struct rcu_head *head) |
| 896 | { | ||
| 897 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | ||
| 898 | |||
| 899 | schedule_work(&cgrp->free_work); | ||
| 900 | } | ||
| 901 | |||
| 902 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | ||
| 903 | { | ||
| 904 | /* is dentry a directory ? if so, kfree() associated cgroup */ | ||
| 905 | if (S_ISDIR(inode->i_mode)) { | ||
| 906 | struct cgroup *cgrp = dentry->d_fsdata; | ||
| 897 | 907 | ||
| 898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 908 | BUG_ON(!(cgroup_is_removed(cgrp))); |
| 899 | kfree_rcu(cgrp, rcu_head); | 909 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
| 900 | } else { | 910 | } else { |
| 901 | struct cfent *cfe = __d_cfe(dentry); | 911 | struct cfent *cfe = __d_cfe(dentry); |
| 902 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 912 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
| @@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d) | |||
| 925 | dput(parent); | 935 | dput(parent); |
| 926 | } | 936 | } |
| 927 | 937 | ||
| 928 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | 938 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
| 929 | { | 939 | { |
| 930 | struct cfent *cfe; | 940 | struct cfent *cfe; |
| 931 | 941 | ||
| 932 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | 942 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
| 933 | lockdep_assert_held(&cgroup_mutex); | 943 | lockdep_assert_held(&cgroup_mutex); |
| 934 | 944 | ||
| 945 | /* | ||
| 946 | * If we're doing cleanup due to failure of cgroup_create(), | ||
| 947 | * the corresponding @cfe may not exist. | ||
| 948 | */ | ||
| 935 | list_for_each_entry(cfe, &cgrp->files, node) { | 949 | list_for_each_entry(cfe, &cgrp->files, node) { |
| 936 | struct dentry *d = cfe->dentry; | 950 | struct dentry *d = cfe->dentry; |
| 937 | 951 | ||
| @@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
| 944 | list_del_init(&cfe->node); | 958 | list_del_init(&cfe->node); |
| 945 | dput(d); | 959 | dput(d); |
| 946 | 960 | ||
| 947 | return 0; | 961 | break; |
| 948 | } | 962 | } |
| 949 | return -ENOENT; | ||
| 950 | } | 963 | } |
| 951 | 964 | ||
| 952 | /** | 965 | /** |
| @@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1083 | } | 1096 | } |
| 1084 | } | 1097 | } |
| 1085 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; | 1098 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; |
| 1086 | synchronize_rcu(); | ||
| 1087 | 1099 | ||
| 1088 | return 0; | 1100 | return 0; |
| 1089 | } | 1101 | } |
| @@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | 1405 | INIT_LIST_HEAD(&cgrp->allcg_node); |
| 1394 | INIT_LIST_HEAD(&cgrp->release_list); | 1406 | INIT_LIST_HEAD(&cgrp->release_list); |
| 1395 | INIT_LIST_HEAD(&cgrp->pidlists); | 1407 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1408 | INIT_WORK(&cgrp->free_work, cgroup_free_fn); | ||
| 1396 | mutex_init(&cgrp->pidlist_mutex); | 1409 | mutex_init(&cgrp->pidlist_mutex); |
| 1397 | INIT_LIST_HEAD(&cgrp->event_list); | 1410 | INIT_LIST_HEAD(&cgrp->event_list); |
| 1398 | spin_lock_init(&cgrp->event_list_lock); | 1411 | spin_lock_init(&cgrp->event_list_lock); |
| @@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1597 | struct cgroupfs_root *existing_root; | 1610 | struct cgroupfs_root *existing_root; |
| 1598 | const struct cred *cred; | 1611 | const struct cred *cred; |
| 1599 | int i; | 1612 | int i; |
| 1613 | struct css_set *cg; | ||
| 1600 | 1614 | ||
| 1601 | BUG_ON(sb->s_root != NULL); | 1615 | BUG_ON(sb->s_root != NULL); |
| 1602 | 1616 | ||
| @@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1650 | /* Link the top cgroup in this hierarchy into all | 1664 | /* Link the top cgroup in this hierarchy into all |
| 1651 | * the css_set objects */ | 1665 | * the css_set objects */ |
| 1652 | write_lock(&css_set_lock); | 1666 | write_lock(&css_set_lock); |
| 1653 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 1667 | hash_for_each(css_set_table, i, cg, hlist) |
| 1654 | struct hlist_head *hhead = &css_set_table[i]; | 1668 | link_css_set(&tmp_cg_links, cg, root_cgrp); |
| 1655 | struct hlist_node *node; | ||
| 1656 | struct css_set *cg; | ||
| 1657 | |||
| 1658 | hlist_for_each_entry(cg, node, hhead, hlist) | ||
| 1659 | link_css_set(&tmp_cg_links, cg, root_cgrp); | ||
| 1660 | } | ||
| 1661 | write_unlock(&css_set_lock); | 1669 | write_unlock(&css_set_lock); |
| 1662 | 1670 | ||
| 1663 | free_cg_links(&tmp_cg_links); | 1671 | free_cg_links(&tmp_cg_links); |
| @@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
| 1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), | 1781 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
| 1774 | "cgroup_path() called without proper locking"); | 1782 | "cgroup_path() called without proper locking"); |
| 1775 | 1783 | ||
| 1776 | if (!dentry || cgrp == dummytop) { | 1784 | if (cgrp == dummytop) { |
| 1777 | /* | 1785 | /* |
| 1778 | * Inactive subsystems have no dentry for their root | 1786 | * Inactive subsystems have no dentry for their root |
| 1779 | * cgroup | 1787 | * cgroup |
| @@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1982 | ss->attach(cgrp, &tset); | 1990 | ss->attach(cgrp, &tset); |
| 1983 | } | 1991 | } |
| 1984 | 1992 | ||
| 1985 | synchronize_rcu(); | ||
| 1986 | out: | 1993 | out: |
| 1987 | if (retval) { | 1994 | if (retval) { |
| 1988 | for_each_subsys(root, ss) { | 1995 | for_each_subsys(root, ss) { |
| @@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
| 2151 | /* | 2158 | /* |
| 2152 | * step 5: success! and cleanup | 2159 | * step 5: success! and cleanup |
| 2153 | */ | 2160 | */ |
| 2154 | synchronize_rcu(); | ||
| 2155 | retval = 0; | 2161 | retval = 0; |
| 2156 | out_put_css_set_refs: | 2162 | out_put_css_set_refs: |
| 2157 | if (retval) { | 2163 | if (retval) { |
| @@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un | |||
| 2637 | */ | 2643 | */ |
| 2638 | static inline struct cftype *__file_cft(struct file *file) | 2644 | static inline struct cftype *__file_cft(struct file *file) |
| 2639 | { | 2645 | { |
| 2640 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | 2646 | if (file_inode(file)->i_fop != &cgroup_file_operations) |
| 2641 | return ERR_PTR(-EINVAL); | 2647 | return ERR_PTR(-EINVAL); |
| 2642 | return __d_cft(file->f_dentry); | 2648 | return __d_cft(file->f_dentry); |
| 2643 | } | 2649 | } |
| @@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
| 2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2775 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) |
| 2770 | continue; | 2776 | continue; |
| 2771 | 2777 | ||
| 2772 | if (is_add) | 2778 | if (is_add) { |
| 2773 | err = cgroup_add_file(cgrp, subsys, cft); | 2779 | err = cgroup_add_file(cgrp, subsys, cft); |
| 2774 | else | 2780 | if (err) |
| 2775 | err = cgroup_rm_file(cgrp, cft); | 2781 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
| 2776 | if (err) { | 2782 | cft->name, err); |
| 2777 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
| 2778 | is_add ? "add" : "remove", cft->name, err); | ||
| 2779 | ret = err; | 2783 | ret = err; |
| 2784 | } else { | ||
| 2785 | cgroup_rm_file(cgrp, cft); | ||
| 2780 | } | 2786 | } |
| 2781 | } | 2787 | } |
| 2782 | return ret; | 2788 | return ret; |
| @@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
| 3017 | } | 3023 | } |
| 3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3024 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); |
| 3019 | 3025 | ||
| 3026 | /** | ||
| 3027 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | ||
| 3028 | * @pos: cgroup of interest | ||
| 3029 | * | ||
| 3030 | * Return the rightmost descendant of @pos. If there's no descendant, | ||
| 3031 | * @pos is returned. This can be used during pre-order traversal to skip | ||
| 3032 | * subtree of @pos. | ||
| 3033 | */ | ||
| 3034 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | ||
| 3035 | { | ||
| 3036 | struct cgroup *last, *tmp; | ||
| 3037 | |||
| 3038 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 3039 | |||
| 3040 | do { | ||
| 3041 | last = pos; | ||
| 3042 | /* ->prev isn't RCU safe, walk ->next till the end */ | ||
| 3043 | pos = NULL; | ||
| 3044 | list_for_each_entry_rcu(tmp, &last->children, sibling) | ||
| 3045 | pos = tmp; | ||
| 3046 | } while (pos); | ||
| 3047 | |||
| 3048 | return last; | ||
| 3049 | } | ||
| 3050 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | ||
| 3051 | |||
| 3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3052 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) |
| 3021 | { | 3053 | { |
| 3022 | struct cgroup *last; | 3054 | struct cgroup *last; |
| @@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work) | |||
| 3752 | remove); | 3784 | remove); |
| 3753 | struct cgroup *cgrp = event->cgrp; | 3785 | struct cgroup *cgrp = event->cgrp; |
| 3754 | 3786 | ||
| 3787 | remove_wait_queue(event->wqh, &event->wait); | ||
| 3788 | |||
| 3755 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3789 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
| 3756 | 3790 | ||
| 3791 | /* Notify userspace the event is going away. */ | ||
| 3792 | eventfd_signal(event->eventfd, 1); | ||
| 3793 | |||
| 3757 | eventfd_ctx_put(event->eventfd); | 3794 | eventfd_ctx_put(event->eventfd); |
| 3758 | kfree(event); | 3795 | kfree(event); |
| 3759 | dput(cgrp->dentry); | 3796 | dput(cgrp->dentry); |
| @@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
| 3773 | unsigned long flags = (unsigned long)key; | 3810 | unsigned long flags = (unsigned long)key; |
| 3774 | 3811 | ||
| 3775 | if (flags & POLLHUP) { | 3812 | if (flags & POLLHUP) { |
| 3776 | __remove_wait_queue(event->wqh, &event->wait); | ||
| 3777 | spin_lock(&cgrp->event_list_lock); | ||
| 3778 | list_del_init(&event->list); | ||
| 3779 | spin_unlock(&cgrp->event_list_lock); | ||
| 3780 | /* | 3813 | /* |
| 3781 | * We are in atomic context, but cgroup_event_remove() may | 3814 | * If the event has been detached at cgroup removal, we |
| 3782 | * sleep, so we have to call it in workqueue. | 3815 | * can simply return knowing the other side will cleanup |
| 3816 | * for us. | ||
| 3817 | * | ||
| 3818 | * We can't race against event freeing since the other | ||
| 3819 | * side will require wqh->lock via remove_wait_queue(), | ||
| 3820 | * which we hold. | ||
| 3783 | */ | 3821 | */ |
| 3784 | schedule_work(&event->remove); | 3822 | spin_lock(&cgrp->event_list_lock); |
| 3823 | if (!list_empty(&event->list)) { | ||
| 3824 | list_del_init(&event->list); | ||
| 3825 | /* | ||
| 3826 | * We are in atomic context, but cgroup_event_remove() | ||
| 3827 | * may sleep, so we have to call it in workqueue. | ||
| 3828 | */ | ||
| 3829 | schedule_work(&event->remove); | ||
| 3830 | } | ||
| 3831 | spin_unlock(&cgrp->event_list_lock); | ||
| 3785 | } | 3832 | } |
| 3786 | 3833 | ||
| 3787 | return 0; | 3834 | return 0; |
| @@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
| 3807 | const char *buffer) | 3854 | const char *buffer) |
| 3808 | { | 3855 | { |
| 3809 | struct cgroup_event *event = NULL; | 3856 | struct cgroup_event *event = NULL; |
| 3857 | struct cgroup *cgrp_cfile; | ||
| 3810 | unsigned int efd, cfd; | 3858 | unsigned int efd, cfd; |
| 3811 | struct file *efile = NULL; | 3859 | struct file *efile = NULL; |
| 3812 | struct file *cfile = NULL; | 3860 | struct file *cfile = NULL; |
| @@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
| 3852 | 3900 | ||
| 3853 | /* the process need read permission on control file */ | 3901 | /* the process need read permission on control file */ |
| 3854 | /* AV: shouldn't we check that it's been opened for read instead? */ | 3902 | /* AV: shouldn't we check that it's been opened for read instead? */ |
| 3855 | ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); | 3903 | ret = inode_permission(file_inode(cfile), MAY_READ); |
| 3856 | if (ret < 0) | 3904 | if (ret < 0) |
| 3857 | goto fail; | 3905 | goto fail; |
| 3858 | 3906 | ||
| @@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
| 3862 | goto fail; | 3910 | goto fail; |
| 3863 | } | 3911 | } |
| 3864 | 3912 | ||
| 3913 | /* | ||
| 3914 | * The file to be monitored must be in the same cgroup as | ||
| 3915 | * cgroup.event_control is. | ||
| 3916 | */ | ||
| 3917 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | ||
| 3918 | if (cgrp_cfile != cgrp) { | ||
| 3919 | ret = -EINVAL; | ||
| 3920 | goto fail; | ||
| 3921 | } | ||
| 3922 | |||
| 3865 | if (!event->cft->register_event || !event->cft->unregister_event) { | 3923 | if (!event->cft->register_event || !event->cft->unregister_event) { |
| 3866 | ret = -EINVAL; | 3924 | ret = -EINVAL; |
| 3867 | goto fail; | 3925 | goto fail; |
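
Two conversions in this hunk belong to wider cleanups: file_inode() replaces open-coded f_path.dentry->d_inode / f_dentry->d_inode chains, and the new cgrp_cfile check pins the monitored control file to the same cgroup. A small sketch of the helper, using a hypothetical caller:

	static int may_read_file(struct file *filp)
	{
		/* previously: filp->f_path.dentry->d_inode */
		struct inode *inode = file_inode(filp);

		return inode_permission(inode, MAY_READ);	/* 0 if allowed */
	}
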
| @@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4135 | 4193 | ||
| 4136 | init_cgroup_housekeeping(cgrp); | 4194 | init_cgroup_housekeeping(cgrp); |
| 4137 | 4195 | ||
| 4196 | dentry->d_fsdata = cgrp; | ||
| 4197 | cgrp->dentry = dentry; | ||
| 4198 | |||
| 4138 | cgrp->parent = parent; | 4199 | cgrp->parent = parent; |
| 4139 | cgrp->root = parent->root; | 4200 | cgrp->root = parent->root; |
| 4140 | cgrp->top_cgroup = parent->top_cgroup; | 4201 | cgrp->top_cgroup = parent->top_cgroup; |
| @@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4233 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
| 4173 | 4234 | ||
| 4174 | /* allocation complete, commit to creation */ | 4235 | /* allocation complete, commit to creation */ |
| 4175 | dentry->d_fsdata = cgrp; | ||
| 4176 | cgrp->dentry = dentry; | ||
| 4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 4236 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
| 4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4237 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
| 4179 | root->number_of_cgroups++; | 4238 | root->number_of_cgroups++; |
| @@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4340 | /* | 4399 | /* |
| 4341 | * Unregister events and notify userspace. | 4400 | * Unregister events and notify userspace. |
| 4342 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4401 | * Notify userspace about cgroup removing only after rmdir of cgroup |
| 4343 | * directory to avoid race between userspace and kernelspace. Use | 4402 | * directory to avoid race between userspace and kernelspace. |
| 4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
| 4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
| 4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
| 4347 | */ | 4403 | */ |
| 4348 | spin_lock(&cgrp->event_list_lock); | 4404 | spin_lock(&cgrp->event_list_lock); |
| 4349 | list_splice_init(&cgrp->event_list, &tmp_list); | 4405 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { |
| 4350 | spin_unlock(&cgrp->event_list_lock); | ||
| 4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
| 4352 | list_del_init(&event->list); | 4406 | list_del_init(&event->list); |
| 4353 | remove_wait_queue(event->wqh, &event->wait); | ||
| 4354 | eventfd_signal(event->eventfd, 1); | ||
| 4355 | schedule_work(&event->remove); | 4407 | schedule_work(&event->remove); |
| 4356 | } | 4408 | } |
| 4409 | spin_unlock(&cgrp->event_list_lock); | ||
| 4357 | 4410 | ||
| 4358 | return 0; | 4411 | return 0; |
| 4359 | } | 4412 | } |
| @@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4438 | { | 4491 | { |
| 4439 | struct cgroup_subsys_state *css; | 4492 | struct cgroup_subsys_state *css; |
| 4440 | int i, ret; | 4493 | int i, ret; |
| 4494 | struct hlist_node *tmp; | ||
| 4495 | struct css_set *cg; | ||
| 4496 | unsigned long key; | ||
| 4441 | 4497 | ||
| 4442 | /* check name and function validity */ | 4498 | /* check name and function validity */ |
| 4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4499 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
| @@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4503 | * this is all done under the css_set_lock. | 4559 | * this is all done under the css_set_lock. |
| 4504 | */ | 4560 | */ |
| 4505 | write_lock(&css_set_lock); | 4561 | write_lock(&css_set_lock); |
| 4506 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 4562 | hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { |
| 4507 | struct css_set *cg; | 4563 | /* skip entries that we already rehashed */ |
| 4508 | struct hlist_node *node, *tmp; | 4564 | if (cg->subsys[ss->subsys_id]) |
| 4509 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | 4565 | continue; |
| 4510 | 4566 | /* remove existing entry */ | |
| 4511 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | 4567 | hash_del(&cg->hlist); |
| 4512 | /* skip entries that we already rehashed */ | 4568 | /* set new value */ |
| 4513 | if (cg->subsys[ss->subsys_id]) | 4569 | cg->subsys[ss->subsys_id] = css; |
| 4514 | continue; | 4570 | /* recompute hash and restore entry */ |
| 4515 | /* remove existing entry */ | 4571 | key = css_set_hash(cg->subsys); |
| 4516 | hlist_del(&cg->hlist); | 4572 | hash_add(css_set_table, &cg->hlist, key); |
| 4517 | /* set new value */ | ||
| 4518 | cg->subsys[ss->subsys_id] = css; | ||
| 4519 | /* recompute hash and restore entry */ | ||
| 4520 | new_bucket = css_set_hash(cg->subsys); | ||
| 4521 | hlist_add_head(&cg->hlist, new_bucket); | ||
| 4522 | } | ||
| 4523 | } | 4573 | } |
| 4524 | write_unlock(&css_set_lock); | 4574 | write_unlock(&css_set_lock); |
| 4525 | 4575 | ||
| @@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
| 4551 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4601 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
| 4552 | { | 4602 | { |
| 4553 | struct cg_cgroup_link *link; | 4603 | struct cg_cgroup_link *link; |
| 4554 | struct hlist_head *hhead; | ||
| 4555 | 4604 | ||
| 4556 | BUG_ON(ss->module == NULL); | 4605 | BUG_ON(ss->module == NULL); |
| 4557 | 4606 | ||
| @@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 4567 | offline_css(ss, dummytop); | 4616 | offline_css(ss, dummytop); |
| 4568 | ss->active = 0; | 4617 | ss->active = 0; |
| 4569 | 4618 | ||
| 4570 | if (ss->use_id) { | 4619 | if (ss->use_id) |
| 4571 | idr_remove_all(&ss->idr); | ||
| 4572 | idr_destroy(&ss->idr); | 4620 | idr_destroy(&ss->idr); |
| 4573 | } | ||
| 4574 | 4621 | ||
| 4575 | /* deassign the subsys_id */ | 4622 | /* deassign the subsys_id */ |
| 4576 | subsys[ss->subsys_id] = NULL; | 4623 | subsys[ss->subsys_id] = NULL; |
| @@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 4585 | write_lock(&css_set_lock); | 4632 | write_lock(&css_set_lock); |
| 4586 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4633 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { |
| 4587 | struct css_set *cg = link->cg; | 4634 | struct css_set *cg = link->cg; |
| 4635 | unsigned long key; | ||
| 4588 | 4636 | ||
| 4589 | hlist_del(&cg->hlist); | 4637 | hash_del(&cg->hlist); |
| 4590 | cg->subsys[ss->subsys_id] = NULL; | 4638 | cg->subsys[ss->subsys_id] = NULL; |
| 4591 | hhead = css_set_hash(cg->subsys); | 4639 | key = css_set_hash(cg->subsys); |
| 4592 | hlist_add_head(&cg->hlist, hhead); | 4640 | hash_add(css_set_table, &cg->hlist, key); |
| 4593 | } | 4641 | } |
| 4594 | write_unlock(&css_set_lock); | 4642 | write_unlock(&css_set_lock); |
| 4595 | 4643 | ||
| @@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void) | |||
| 4631 | list_add(&init_css_set_link.cg_link_list, | 4679 | list_add(&init_css_set_link.cg_link_list, |
| 4632 | &init_css_set.cg_links); | 4680 | &init_css_set.cg_links); |
| 4633 | 4681 | ||
| 4634 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | ||
| 4635 | INIT_HLIST_HEAD(&css_set_table[i]); | ||
| 4636 | |||
| 4637 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4682 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 4638 | struct cgroup_subsys *ss = subsys[i]; | 4683 | struct cgroup_subsys *ss = subsys[i]; |
| 4639 | 4684 | ||
| @@ -4667,7 +4712,7 @@ int __init cgroup_init(void) | |||
| 4667 | { | 4712 | { |
| 4668 | int err; | 4713 | int err; |
| 4669 | int i; | 4714 | int i; |
| 4670 | struct hlist_head *hhead; | 4715 | unsigned long key; |
| 4671 | 4716 | ||
| 4672 | err = bdi_init(&cgroup_backing_dev_info); | 4717 | err = bdi_init(&cgroup_backing_dev_info); |
| 4673 | if (err) | 4718 | if (err) |
| @@ -4686,8 +4731,8 @@ int __init cgroup_init(void) | |||
| 4686 | } | 4731 | } |
| 4687 | 4732 | ||
| 4688 | /* Add init_css_set to the hash table */ | 4733 | /* Add init_css_set to the hash table */ |
| 4689 | hhead = css_set_hash(init_css_set.subsys); | 4734 | key = css_set_hash(init_css_set.subsys); |
| 4690 | hlist_add_head(&init_css_set.hlist, hhead); | 4735 | hash_add(css_set_table, &init_css_set.hlist, key); |
| 4691 | BUG_ON(!init_root_id(&rootnode)); | 4736 | BUG_ON(!init_root_id(&rootnode)); |
| 4692 | 4737 | ||
| 4693 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4738 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
| @@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4982 | } | 5027 | } |
| 4983 | task_unlock(tsk); | 5028 | task_unlock(tsk); |
| 4984 | 5029 | ||
| 4985 | if (cg) | 5030 | put_css_set_taskexit(cg); |
| 4986 | put_css_set_taskexit(cg); | ||
| 4987 | } | 5031 | } |
| 4988 | 5032 | ||
| 4989 | /** | 5033 | /** |
| @@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id); | |||
| 5274 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | 5318 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) |
| 5275 | { | 5319 | { |
| 5276 | struct css_id *newid; | 5320 | struct css_id *newid; |
| 5277 | int myid, error, size; | 5321 | int ret, size; |
| 5278 | 5322 | ||
| 5279 | BUG_ON(!ss->use_id); | 5323 | BUG_ON(!ss->use_id); |
| 5280 | 5324 | ||
| @@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
| 5282 | newid = kzalloc(size, GFP_KERNEL); | 5326 | newid = kzalloc(size, GFP_KERNEL); |
| 5283 | if (!newid) | 5327 | if (!newid) |
| 5284 | return ERR_PTR(-ENOMEM); | 5328 | return ERR_PTR(-ENOMEM); |
| 5285 | /* get id */ | 5329 | |
| 5286 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { | 5330 | idr_preload(GFP_KERNEL); |
| 5287 | error = -ENOMEM; | ||
| 5288 | goto err_out; | ||
| 5289 | } | ||
| 5290 | spin_lock(&ss->id_lock); | 5331 | spin_lock(&ss->id_lock); |
| 5291 | /* Don't use 0. allocates an ID of 1-65535 */ | 5332 | /* Don't use 0. allocates an ID of 1-65535 */ |
| 5292 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 5333 | ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); |
| 5293 | spin_unlock(&ss->id_lock); | 5334 | spin_unlock(&ss->id_lock); |
| 5335 | idr_preload_end(); | ||
| 5294 | 5336 | ||
| 5295 | /* Returns error when there are no free spaces for new ID.*/ | 5337 | /* Returns error when there are no free spaces for new ID.*/ |
| 5296 | if (error) { | 5338 | if (ret < 0) |
| 5297 | error = -ENOSPC; | ||
| 5298 | goto err_out; | 5339 | goto err_out; |
| 5299 | } | ||
| 5300 | if (myid > CSS_ID_MAX) | ||
| 5301 | goto remove_idr; | ||
| 5302 | 5340 | ||
| 5303 | newid->id = myid; | 5341 | newid->id = ret; |
| 5304 | newid->depth = depth; | 5342 | newid->depth = depth; |
| 5305 | return newid; | 5343 | return newid; |
| 5306 | remove_idr: | ||
| 5307 | error = -ENOSPC; | ||
| 5308 | spin_lock(&ss->id_lock); | ||
| 5309 | idr_remove(&ss->idr, myid); | ||
| 5310 | spin_unlock(&ss->id_lock); | ||
| 5311 | err_out: | 5344 | err_out: |
| 5312 | kfree(newid); | 5345 | kfree(newid); |
| 5313 | return ERR_PTR(error); | 5346 | return ERR_PTR(ret); |
| 5314 | 5347 | ||
| 5315 | } | 5348 | } |
| 5316 | 5349 | ||
| @@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
| 5441 | struct inode *inode; | 5474 | struct inode *inode; |
| 5442 | struct cgroup_subsys_state *css; | 5475 | struct cgroup_subsys_state *css; |
| 5443 | 5476 | ||
| 5444 | inode = f->f_dentry->d_inode; | 5477 | inode = file_inode(f); |
| 5445 | /* check in cgroup filesystem dir */ | 5478 | /* check in cgroup filesystem dir */ |
| 5446 | if (inode->i_op != &cgroup_dir_inode_operations) | 5479 | if (inode->i_op != &cgroup_dir_inode_operations) |
| 5447 | return ERR_PTR(-EBADF); | 5480 | return ERR_PTR(-EBADF); |
diff --git a/kernel/compat.c b/kernel/compat.c index f6150e92dfc9..19971d8c7299 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, | |||
| 290 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); | 290 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); |
| 291 | } | 291 | } |
| 292 | 292 | ||
| 293 | asmlinkage long compat_sys_getitimer(int which, | 293 | COMPAT_SYSCALL_DEFINE2(getitimer, int, which, |
| 294 | struct compat_itimerval __user *it) | 294 | struct compat_itimerval __user *, it) |
| 295 | { | 295 | { |
| 296 | struct itimerval kit; | 296 | struct itimerval kit; |
| 297 | int error; | 297 | int error; |
| @@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which, | |||
| 302 | return error; | 302 | return error; |
| 303 | } | 303 | } |
| 304 | 304 | ||
| 305 | asmlinkage long compat_sys_setitimer(int which, | 305 | COMPAT_SYSCALL_DEFINE3(setitimer, int, which, |
| 306 | struct compat_itimerval __user *in, | 306 | struct compat_itimerval __user *, in, |
| 307 | struct compat_itimerval __user *out) | 307 | struct compat_itimerval __user *, out) |
| 308 | { | 308 | { |
| 309 | struct itimerval kin, kout; | 309 | struct itimerval kin, kout; |
| 310 | int error; | 310 | int error; |
| @@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | |||
| 381 | memcpy(blocked->sig, &set, sizeof(set)); | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
| 382 | } | 382 | } |
| 383 | 383 | ||
| 384 | asmlinkage long compat_sys_sigprocmask(int how, | 384 | COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, |
| 385 | compat_old_sigset_t __user *nset, | 385 | compat_old_sigset_t __user *, nset, |
| 386 | compat_old_sigset_t __user *oset) | 386 | compat_old_sigset_t __user *, oset) |
| 387 | { | 387 | { |
| 388 | old_sigset_t old_set, new_set; | 388 | old_sigset_t old_set, new_set; |
| 389 | sigset_t new_blocked; | 389 | sigset_t new_blocked; |
| @@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) | |||
| 535 | return 0; | 535 | return 0; |
| 536 | } | 536 | } |
| 537 | 537 | ||
| 538 | asmlinkage long | 538 | COMPAT_SYSCALL_DEFINE4(wait4, |
| 539 | compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, | 539 | compat_pid_t, pid, |
| 540 | struct compat_rusage __user *ru) | 540 | compat_uint_t __user *, stat_addr, |
| 541 | int, options, | ||
| 542 | struct compat_rusage __user *, ru) | ||
| 541 | { | 543 | { |
| 542 | if (!ru) { | 544 | if (!ru) { |
| 543 | return sys_wait4(pid, stat_addr, options, NULL); | 545 | return sys_wait4(pid, stat_addr, options, NULL); |
| @@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, | |||
| 564 | } | 566 | } |
| 565 | } | 567 | } |
| 566 | 568 | ||
| 567 | asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, | 569 | COMPAT_SYSCALL_DEFINE5(waitid, |
| 568 | struct compat_siginfo __user *uinfo, int options, | 570 | int, which, compat_pid_t, pid, |
| 569 | struct compat_rusage __user *uru) | 571 | struct compat_siginfo __user *, uinfo, int, options, |
| 572 | struct compat_rusage __user *, uru) | ||
| 570 | { | 573 | { |
| 571 | siginfo_t info; | 574 | siginfo_t info; |
| 572 | struct rusage ru; | 575 | struct rusage ru; |
| @@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, | |||
| 584 | return ret; | 587 | return ret; |
| 585 | 588 | ||
| 586 | if (uru) { | 589 | if (uru) { |
| 587 | ret = put_compat_rusage(&ru, uru); | 590 | /* sys_waitid() overwrites everything in ru */ |
| 591 | if (COMPAT_USE_64BIT_TIME) | ||
| 592 | ret = copy_to_user(uru, &ru, sizeof(ru)); | ||
| 593 | else | ||
| 594 | ret = put_compat_rusage(&ru, uru); | ||
| 588 | if (ret) | 595 | if (ret) |
| 589 | return ret; | 596 | return -EFAULT; |
| 590 | } | 597 | } |
| 591 | 598 | ||
| 592 | BUG_ON(info.si_code & __SI_MASK); | 599 | BUG_ON(info.si_code & __SI_MASK); |
| @@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, | |||
| 964 | } | 971 | } |
| 965 | 972 | ||
| 966 | void | 973 | void |
| 967 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | 974 | sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) |
| 968 | { | 975 | { |
| 969 | switch (_NSIG_WORDS) { | 976 | switch (_NSIG_WORDS) { |
| 970 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | 977 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); |
| @@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | |||
| 975 | } | 982 | } |
| 976 | EXPORT_SYMBOL_GPL(sigset_from_compat); | 983 | EXPORT_SYMBOL_GPL(sigset_from_compat); |
| 977 | 984 | ||
| 978 | asmlinkage long | 985 | void |
| 979 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | 986 | sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) |
| 980 | struct compat_siginfo __user *uinfo, | 987 | { |
| 981 | struct compat_timespec __user *uts, compat_size_t sigsetsize) | 988 | switch (_NSIG_WORDS) { |
| 989 | case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; | ||
| 990 | case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; | ||
| 991 | case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; | ||
| 992 | case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; | ||
| 993 | } | ||
| 994 | } | ||
| 995 | |||
| 996 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
| 997 | struct compat_siginfo __user *, uinfo, | ||
| 998 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
| 982 | { | 999 | { |
| 983 | compat_sigset_t s32; | 1000 | compat_sigset_t s32; |
| 984 | sigset_t s; | 1001 | sigset_t s; |
| @@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
| 994 | sigset_from_compat(&s, &s32); | 1011 | sigset_from_compat(&s, &s32); |
| 995 | 1012 | ||
| 996 | if (uts) { | 1013 | if (uts) { |
| 997 | if (get_compat_timespec(&t, uts)) | 1014 | if (compat_get_timespec(&t, uts)) |
| 998 | return -EFAULT; | 1015 | return -EFAULT; |
| 999 | } | 1016 | } |
| 1000 | 1017 | ||
| @@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
| 1006 | } | 1023 | } |
| 1007 | 1024 | ||
| 1008 | return ret; | 1025 | return ret; |
| 1009 | |||
| 1010 | } | ||
| 1011 | |||
| 1012 | asmlinkage long | ||
| 1013 | compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, | ||
| 1014 | struct compat_siginfo __user *uinfo) | ||
| 1015 | { | ||
| 1016 | siginfo_t info; | ||
| 1017 | |||
| 1018 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
| 1019 | return -EFAULT; | ||
| 1020 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | ||
| 1021 | } | 1026 | } |
| 1022 | 1027 | ||
| 1023 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME | 1028 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME |
| @@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) | |||
| 1060 | 1065 | ||
| 1061 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | 1066 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ |
| 1062 | 1067 | ||
| 1063 | #ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND | ||
| 1064 | asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) | ||
| 1065 | { | ||
| 1066 | sigset_t newset; | ||
| 1067 | compat_sigset_t newset32; | ||
| 1068 | |||
| 1069 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
| 1070 | if (sigsetsize != sizeof(sigset_t)) | ||
| 1071 | return -EINVAL; | ||
| 1072 | |||
| 1073 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | ||
| 1074 | return -EFAULT; | ||
| 1075 | sigset_from_compat(&newset, &newset32); | ||
| 1076 | return sigsuspend(&newset); | ||
| 1077 | } | ||
| 1078 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | ||
| 1079 | |||
| 1080 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | 1068 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) |
| 1081 | { | 1069 | { |
| 1082 | struct timex txc; | 1070 | struct timex txc; |
| @@ -1215,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
| 1215 | return 0; | 1203 | return 0; |
| 1216 | } | 1204 | } |
| 1217 | 1205 | ||
| 1218 | #ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL | 1206 | COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, |
| 1219 | asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | 1207 | compat_pid_t, pid, |
| 1220 | struct compat_timespec __user *interval) | 1208 | struct compat_timespec __user *, interval) |
| 1221 | { | 1209 | { |
| 1222 | struct timespec t; | 1210 | struct timespec t; |
| 1223 | int ret; | 1211 | int ret; |
| @@ -1230,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | |||
| 1230 | return -EFAULT; | 1218 | return -EFAULT; |
| 1231 | return ret; | 1219 | return ret; |
| 1232 | } | 1220 | } |
| 1233 | #endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ | ||
| 1234 | 1221 | ||
| 1235 | /* | 1222 | /* |
| 1236 | * Allocate user-space memory for the duration of a single system call, | 1223 | * Allocate user-space memory for the duration of a single system call, |
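
The kernel/compat.c hunks convert hand-written "asmlinkage long compat_sys_*()" definitions to the COMPAT_SYSCALL_DEFINEn() macros, whose arguments alternate type and name and whose n is the parameter count. A schematic definition for a made-up two-argument compat call (the "example" syscall is hypothetical, not from this patch):

	COMPAT_SYSCALL_DEFINE2(example, int, which,
			       struct compat_timespec __user *, uts)
	{
		struct timespec ts;

		if (compat_get_timespec(&ts, uts))	/* copy and widen from userspace */
			return -EFAULT;
		/* ... operate on the native struct timespec ... */
		return 0;
	}
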
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd55508..65349f07b878 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -1,29 +1,41 @@ | |||
| 1 | /* | ||
| 2 | * Context tracking: Probe on high level context boundaries such as kernel | ||
| 3 | * and userspace. This includes syscalls and exceptions entry/exit. | ||
| 4 | * | ||
| 5 | * This is used by RCU to remove its dependency on the timer tick while a CPU | ||
| 6 | * runs in userspace. | ||
| 7 | * | ||
| 8 | * Started by Frederic Weisbecker: | ||
| 9 | * | ||
| 10 | * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> | ||
| 11 | * | ||
| 12 | * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, | ||
| 13 | * Steven Rostedt, Peter Zijlstra for suggestions and improvements. | ||
| 14 | * | ||
| 15 | */ | ||
| 16 | |||
| 1 | #include <linux/context_tracking.h> | 17 | #include <linux/context_tracking.h> |
| 18 | #include <linux/kvm_host.h> | ||
| 2 | #include <linux/rcupdate.h> | 19 | #include <linux/rcupdate.h> |
| 3 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 4 | #include <linux/percpu.h> | ||
| 5 | #include <linux/hardirq.h> | 21 | #include <linux/hardirq.h> |
| 22 | #include <linux/export.h> | ||
| 6 | 23 | ||
| 7 | struct context_tracking { | 24 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { |
| 8 | /* | ||
| 9 | * When active is false, hooks are not set to | ||
| 10 | * minimize overhead: TIF flags are cleared | ||
| 11 | * and calls to user_enter/exit are ignored. This | ||
| 12 | * may be further optimized using static keys. | ||
| 13 | */ | ||
| 14 | bool active; | ||
| 15 | enum { | ||
| 16 | IN_KERNEL = 0, | ||
| 17 | IN_USER, | ||
| 18 | } state; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | ||
| 22 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | 25 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE |
| 23 | .active = true, | 26 | .active = true, |
| 24 | #endif | 27 | #endif |
| 25 | }; | 28 | }; |
| 26 | 29 | ||
| 30 | /** | ||
| 31 | * user_enter - Inform the context tracking that the CPU is going to | ||
| 32 | * enter userspace mode. | ||
| 33 | * | ||
| 34 | * This function must be called right before we switch from the kernel | ||
| 35 | * to userspace, when it's guaranteed the remaining kernel instructions | ||
| 36 | * to execute won't use any RCU read side critical section because this | ||
| 37 | * function sets RCU in extended quiescent state. | ||
| 38 | */ | ||
| 27 | void user_enter(void) | 39 | void user_enter(void) |
| 28 | { | 40 | { |
| 29 | unsigned long flags; | 41 | unsigned long flags; |
| @@ -39,40 +51,90 @@ void user_enter(void) | |||
| 39 | if (in_interrupt()) | 51 | if (in_interrupt()) |
| 40 | return; | 52 | return; |
| 41 | 53 | ||
| 54 | /* Kernel threads aren't supposed to go to userspace */ | ||
| 42 | WARN_ON_ONCE(!current->mm); | 55 | WARN_ON_ONCE(!current->mm); |
| 43 | 56 | ||
| 44 | local_irq_save(flags); | 57 | local_irq_save(flags); |
| 45 | if (__this_cpu_read(context_tracking.active) && | 58 | if (__this_cpu_read(context_tracking.active) && |
| 46 | __this_cpu_read(context_tracking.state) != IN_USER) { | 59 | __this_cpu_read(context_tracking.state) != IN_USER) { |
| 47 | __this_cpu_write(context_tracking.state, IN_USER); | 60 | /* |
| 61 | * At this stage, only low level arch entry code remains and | ||
| 62 | * then we'll run in userspace. We can assume there won't be | ||
| 63 | * any RCU read-side critical section until the next call to | ||
| 64 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | ||
| 65 | * on the tick. | ||
| 66 | */ | ||
| 67 | vtime_user_enter(current); | ||
| 48 | rcu_user_enter(); | 68 | rcu_user_enter(); |
| 69 | __this_cpu_write(context_tracking.state, IN_USER); | ||
| 49 | } | 70 | } |
| 50 | local_irq_restore(flags); | 71 | local_irq_restore(flags); |
| 51 | } | 72 | } |
| 52 | 73 | ||
| 74 | |||
| 75 | /** | ||
| 76 | * user_exit - Inform the context tracking that the CPU is | ||
| 77 | * exiting userspace mode and entering the kernel. | ||
| 78 | * | ||
| 79 | * This function must be called after we entered the kernel from userspace | ||
| 80 | * before any use of RCU read side critical section. This potentially include | ||
| 81 | * any high level kernel code like syscalls, exceptions, signal handling, etc... | ||
| 82 | * | ||
| 83 | * This call supports re-entrancy. This way it can be called from any exception | ||
| 84 | * handler without needing to know if we came from userspace or not. | ||
| 85 | */ | ||
| 53 | void user_exit(void) | 86 | void user_exit(void) |
| 54 | { | 87 | { |
| 55 | unsigned long flags; | 88 | unsigned long flags; |
| 56 | 89 | ||
| 57 | /* | ||
| 58 | * Some contexts may involve an exception occuring in an irq, | ||
| 59 | * leading to that nesting: | ||
| 60 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
| 61 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
| 62 | * helpers are enough to protect RCU uses inside the exception. So | ||
| 63 | * just return immediately if we detect we are in an IRQ. | ||
| 64 | */ | ||
| 65 | if (in_interrupt()) | 90 | if (in_interrupt()) |
| 66 | return; | 91 | return; |
| 67 | 92 | ||
| 68 | local_irq_save(flags); | 93 | local_irq_save(flags); |
| 69 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 94 | if (__this_cpu_read(context_tracking.state) == IN_USER) { |
| 70 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 95 | /* |
| 96 | * We are going to run code that may use RCU. Inform | ||
| 97 | * RCU core about that (ie: we may need the tick again). | ||
| 98 | */ | ||
| 71 | rcu_user_exit(); | 99 | rcu_user_exit(); |
| 100 | vtime_user_exit(current); | ||
| 101 | __this_cpu_write(context_tracking.state, IN_KERNEL); | ||
| 72 | } | 102 | } |
| 73 | local_irq_restore(flags); | 103 | local_irq_restore(flags); |
| 74 | } | 104 | } |
| 75 | 105 | ||
| 106 | void guest_enter(void) | ||
| 107 | { | ||
| 108 | if (vtime_accounting_enabled()) | ||
| 109 | vtime_guest_enter(current); | ||
| 110 | else | ||
| 111 | __guest_enter(); | ||
| 112 | } | ||
| 113 | EXPORT_SYMBOL_GPL(guest_enter); | ||
| 114 | |||
| 115 | void guest_exit(void) | ||
| 116 | { | ||
| 117 | if (vtime_accounting_enabled()) | ||
| 118 | vtime_guest_exit(current); | ||
| 119 | else | ||
| 120 | __guest_exit(); | ||
| 121 | } | ||
| 122 | EXPORT_SYMBOL_GPL(guest_exit); | ||
| 123 | |||
| 124 | |||
| 125 | /** | ||
| 126 | * context_tracking_task_switch - context switch the syscall callbacks | ||
| 127 | * @prev: the task that is being switched out | ||
| 128 | * @next: the task that is being switched in | ||
| 129 | * | ||
| 130 | * The context tracking uses the syscall slow path to implement its user-kernel | ||
| 131 | * boundaries probes on syscalls. This way it doesn't impact the syscall fast | ||
| 132 | * path on CPUs that don't do context tracking. | ||
| 133 | * | ||
| 134 | * But we need to clear the flag on the previous task because it may later | ||
| 135 | * migrate to some CPU that doesn't do the context tracking. As such the TIF | ||
| 136 | * flag may not be desired there. | ||
| 137 | */ | ||
| 76 | void context_tracking_task_switch(struct task_struct *prev, | 138 | void context_tracking_task_switch(struct task_struct *prev, |
| 77 | struct task_struct *next) | 139 | struct task_struct *next) |
| 78 | { | 140 | { |
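
The context_tracking.c changes are mostly documentation plus moving the vtime hooks inside the state switch; the contract they document is that the arch syscall slow path (selected via the TIF flag that context_tracking_task_switch() migrates) calls user_exit() on kernel entry and user_enter() right before returning to userspace. A schematic pair of hooks for an imaginary architecture:

	#include <linux/context_tracking.h>

	/* called from the arch syscall slow path, after saving registers */
	void arch_example_syscall_entry(void)
	{
		user_exit();	/* back in the kernel: RCU may be used again */
	}

	/* called just before the return-to-user instruction sequence */
	void arch_example_syscall_exit(void)
	{
		user_enter();	/* RCU extended quiescent state; tick may stop */
	}
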
diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..b5e4ab2d427e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu) | |||
| 224 | static inline void check_for_tasks(int cpu) | 224 | static inline void check_for_tasks(int cpu) |
| 225 | { | 225 | { |
| 226 | struct task_struct *p; | 226 | struct task_struct *p; |
| 227 | cputime_t utime, stime; | ||
| 227 | 228 | ||
| 228 | write_lock_irq(&tasklist_lock); | 229 | write_lock_irq(&tasklist_lock); |
| 229 | for_each_process(p) { | 230 | for_each_process(p) { |
| 231 | task_cputime(p, &utime, &stime); | ||
| 230 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 232 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
| 231 | (p->utime || p->stime)) | 233 | (utime || stime)) |
| 232 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 234 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " |
| 233 | "(state = %ld, flags = %x)\n", | 235 | "(state = %ld, flags = %x)\n", |
| 234 | p->comm, task_pid_nr(p), cpu, | 236 | p->comm, task_pid_nr(p), cpu, |
| @@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param) | |||
| 254 | return err; | 256 | return err; |
| 255 | 257 | ||
| 256 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 258 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
| 259 | /* Park the stopper thread */ | ||
| 260 | kthread_park(current); | ||
| 257 | return 0; | 261 | return 0; |
| 258 | } | 262 | } |
| 259 | 263 | ||
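
check_for_tasks() above stops reading p->utime and p->stime directly and goes through the task_cputime() accessor introduced for the full-dynticks cputime work, which hides how the two counters are maintained internally. A small sketch with a hypothetical reporting helper:

	static void report_if_ran(struct task_struct *p)
	{
		cputime_t utime, stime;

		task_cputime(p, &utime, &stime);	/* snapshot user and system time */
		if (utime || stime)
			printk(KERN_INFO "%s (pid %d) has accumulated CPU time\n",
			       p->comm, task_pid_nr(p));
	}
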
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..4f9dfe43ecbd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -61,14 +61,6 @@ | |||
| 61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * Workqueue for cpuset related tasks. | ||
| 65 | * | ||
| 66 | * Using kevent workqueue may cause deadlock when memory_migrate | ||
| 67 | * is set. So we create a separate workqueue thread for cpuset. | ||
| 68 | */ | ||
| 69 | static struct workqueue_struct *cpuset_wq; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Tracks how many cpusets are currently defined in system. | 64 | * Tracks how many cpusets are currently defined in system. |
| 73 | * When there is only one cpuset (the root cpuset) we can | 65 | * When there is only one cpuset (the root cpuset) we can |
| 74 | * short circuit some hooks. | 66 | * short circuit some hooks. |
| @@ -95,18 +87,21 @@ struct cpuset { | |||
| 95 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
| 96 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
| 97 | 89 | ||
| 98 | struct cpuset *parent; /* my parent */ | ||
| 99 | |||
| 100 | struct fmeter fmeter; /* memory_pressure filter */ | 90 | struct fmeter fmeter; /* memory_pressure filter */ |
| 101 | 91 | ||
| 92 | /* | ||
| 93 | * Tasks are being attached to this cpuset. Used to prevent | ||
| 94 | * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). | ||
| 95 | */ | ||
| 96 | int attach_in_progress; | ||
| 97 | |||
| 102 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
| 103 | int pn; | 99 | int pn; |
| 104 | 100 | ||
| 105 | /* for custom sched domain */ | 101 | /* for custom sched domain */ |
| 106 | int relax_domain_level; | 102 | int relax_domain_level; |
| 107 | 103 | ||
| 108 | /* used for walking a cpuset hierarchy */ | 104 | struct work_struct hotplug_work; |
| 109 | struct list_head stack_list; | ||
| 110 | }; | 105 | }; |
| 111 | 106 | ||
| 112 | /* Retrieve the cpuset for a cgroup */ | 107 | /* Retrieve the cpuset for a cgroup */ |
| @@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
| 123 | struct cpuset, css); | 118 | struct cpuset, css); |
| 124 | } | 119 | } |
| 125 | 120 | ||
| 121 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | ||
| 122 | { | ||
| 123 | struct cgroup *pcgrp = cs->css.cgroup->parent; | ||
| 124 | |||
| 125 | if (pcgrp) | ||
| 126 | return cgroup_cs(pcgrp); | ||
| 127 | return NULL; | ||
| 128 | } | ||
| 129 | |||
| 126 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
| 127 | static inline bool task_has_mempolicy(struct task_struct *task) | 131 | static inline bool task_has_mempolicy(struct task_struct *task) |
| 128 | { | 132 | { |
| @@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) | |||
| 138 | 142 | ||
| 139 | /* bits in struct cpuset flags field */ | 143 | /* bits in struct cpuset flags field */ |
| 140 | typedef enum { | 144 | typedef enum { |
| 145 | CS_ONLINE, | ||
| 141 | CS_CPU_EXCLUSIVE, | 146 | CS_CPU_EXCLUSIVE, |
| 142 | CS_MEM_EXCLUSIVE, | 147 | CS_MEM_EXCLUSIVE, |
| 143 | CS_MEM_HARDWALL, | 148 | CS_MEM_HARDWALL, |
| @@ -147,13 +152,12 @@ typedef enum { | |||
| 147 | CS_SPREAD_SLAB, | 152 | CS_SPREAD_SLAB, |
| 148 | } cpuset_flagbits_t; | 153 | } cpuset_flagbits_t; |
| 149 | 154 | ||
| 150 | /* the type of hotplug event */ | ||
| 151 | enum hotplug_event { | ||
| 152 | CPUSET_CPU_OFFLINE, | ||
| 153 | CPUSET_MEM_OFFLINE, | ||
| 154 | }; | ||
| 155 | |||
| 156 | /* convenient tests for these bits */ | 155 | /* convenient tests for these bits */ |
| 156 | static inline bool is_cpuset_online(const struct cpuset *cs) | ||
| 157 | { | ||
| 158 | return test_bit(CS_ONLINE, &cs->flags); | ||
| 159 | } | ||
| 160 | |||
| 157 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 161 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
| 158 | { | 162 | { |
| 159 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); | 163 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
| @@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
| 190 | } | 194 | } |
| 191 | 195 | ||
| 192 | static struct cpuset top_cpuset = { | 196 | static struct cpuset top_cpuset = { |
| 193 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 197 | .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | |
| 198 | (1 << CS_MEM_EXCLUSIVE)), | ||
| 194 | }; | 199 | }; |
| 195 | 200 | ||
| 201 | /** | ||
| 202 | * cpuset_for_each_child - traverse online children of a cpuset | ||
| 203 | * @child_cs: loop cursor pointing to the current child | ||
| 204 | * @pos_cgrp: used for iteration | ||
| 205 | * @parent_cs: target cpuset to walk children of | ||
| 206 | * | ||
| 207 | * Walk @child_cs through the online children of @parent_cs. Must be used | ||
| 208 | * with RCU read locked. | ||
| 209 | */ | ||
| 210 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | ||
| 211 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | ||
| 212 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | ||
| 213 | |||
| 214 | /** | ||
| 215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | ||
| 216 | * @des_cs: loop cursor pointing to the current descendant | ||
| 217 | * @pos_cgrp: used for iteration | ||
| 218 | * @root_cs: target cpuset to walk ancestor of | ||
| 219 | * | ||
| 220 | * Walk @des_cs through the online descendants of @root_cs. Must be used | ||
| 221 | * with RCU read locked. The caller may modify @pos_cgrp by calling | ||
| 222 | * cgroup_rightmost_descendant() to skip subtree. | ||
| 223 | */ | ||
| 224 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | ||
| 225 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | ||
| 226 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | ||
| 227 | |||
| 196 | /* | 228 | /* |
| 197 | * There are two global mutexes guarding cpuset structures. The first | 229 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
| 198 | * is the main control groups cgroup_mutex, accessed via | 230 | * and callback_mutex. The latter may nest inside the former. We also |
| 199 | * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific | 231 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
| 200 | * callback_mutex, below. They can nest. It is ok to first take | 232 | * See "The task_lock() exception", at the end of this comment. |
| 201 | * cgroup_mutex, then nest callback_mutex. We also require taking | 233 | * |
| 202 | * task_lock() when dereferencing a task's cpuset pointer. See "The | 234 | * A task must hold both mutexes to modify cpusets. If a task holds |
| 203 | * task_lock() exception", at the end of this comment. | 235 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it |
| 204 | * | 236 | * is the only task able to also acquire callback_mutex and be able to |
| 205 | * A task must hold both mutexes to modify cpusets. If a task | 237 | * modify cpusets. It can perform various checks on the cpuset structure |
| 206 | * holds cgroup_mutex, then it blocks others wanting that mutex, | 238 | * first, knowing nothing will change. It can also allocate memory while |
| 207 | * ensuring that it is the only task able to also acquire callback_mutex | 239 | * just holding cpuset_mutex. While it is performing these checks, various |
| 208 | * and be able to modify cpusets. It can perform various checks on | 240 | * callback routines can briefly acquire callback_mutex to query cpusets. |
| 209 | * the cpuset structure first, knowing nothing will change. It can | 241 | * Once it is ready to make the changes, it takes callback_mutex, blocking |
| 210 | * also allocate memory while just holding cgroup_mutex. While it is | 242 | * everyone else. |
| 211 | * performing these checks, various callback routines can briefly | ||
| 212 | * acquire callback_mutex to query cpusets. Once it is ready to make | ||
| 213 | * the changes, it takes callback_mutex, blocking everyone else. | ||
| 214 | * | 243 | * |
| 215 | * Calls to the kernel memory allocator can not be made while holding | 244 | * Calls to the kernel memory allocator can not be made while holding |
| 216 | * callback_mutex, as that would risk double tripping on callback_mutex | 245 | * callback_mutex, as that would risk double tripping on callback_mutex |
| @@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { | |||
| 232 | * guidelines for accessing subsystem state in kernel/cgroup.c | 261 | * guidelines for accessing subsystem state in kernel/cgroup.c |
| 233 | */ | 262 | */ |
| 234 | 263 | ||
| 264 | static DEFINE_MUTEX(cpuset_mutex); | ||
| 235 | static DEFINE_MUTEX(callback_mutex); | 265 | static DEFINE_MUTEX(callback_mutex); |
| 236 | 266 | ||
| 237 | /* | 267 | /* |
| @@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | |||
| 246 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | 276 | static DEFINE_SPINLOCK(cpuset_buffer_lock); |
| 247 | 277 | ||
| 248 | /* | 278 | /* |
| 279 | * CPU / memory hotplug is handled asynchronously. | ||
| 280 | */ | ||
| 281 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | ||
| 282 | |||
| 283 | static void cpuset_hotplug_workfn(struct work_struct *work); | ||
| 284 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work); | ||
| 285 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); | ||
| 286 | |||
| 287 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); | ||
| 288 | |||
| 289 | /* | ||
| 249 | * This is ugly, but preserves the userspace API for existing cpuset | 290 | * This is ugly, but preserves the userspace API for existing cpuset |
| 250 | * users. If someone tries to mount the "cpuset" filesystem, we | 291 | * users. If someone tries to mount the "cpuset" filesystem, we |
| 251 | * silently switch it to mount "cgroup" instead | 292 | * silently switch it to mount "cgroup" instead |
| @@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
| 289 | struct cpumask *pmask) | 330 | struct cpumask *pmask) |
| 290 | { | 331 | { |
| 291 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 332 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
| 292 | cs = cs->parent; | 333 | cs = parent_cs(cs); |
| 293 | if (cs) | 334 | if (cs) |
| 294 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); | 335 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
| 295 | else | 336 | else |
| @@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 314 | { | 355 | { |
| 315 | while (cs && !nodes_intersects(cs->mems_allowed, | 356 | while (cs && !nodes_intersects(cs->mems_allowed, |
| 316 | node_states[N_MEMORY])) | 357 | node_states[N_MEMORY])) |
| 317 | cs = cs->parent; | 358 | cs = parent_cs(cs); |
| 318 | if (cs) | 359 | if (cs) |
| 319 | nodes_and(*pmask, cs->mems_allowed, | 360 | nodes_and(*pmask, cs->mems_allowed, |
| 320 | node_states[N_MEMORY]); | 361 | node_states[N_MEMORY]); |
| @@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 326 | /* | 367 | /* |
| 327 | * update task's spread flag if cpuset's page/slab spread flag is set | 368 | * update task's spread flag if cpuset's page/slab spread flag is set |
| 328 | * | 369 | * |
| 329 | * Called with callback_mutex/cgroup_mutex held | 370 | * Called with callback_mutex/cpuset_mutex held |
| 330 | */ | 371 | */ |
| 331 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | 372 | static void cpuset_update_task_spread_flag(struct cpuset *cs, |
| 332 | struct task_struct *tsk) | 373 | struct task_struct *tsk) |
| @@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, | |||
| 346 | * | 387 | * |
| 347 | * One cpuset is a subset of another if all its allowed CPUs and | 388 | * One cpuset is a subset of another if all its allowed CPUs and |
| 348 | * Memory Nodes are a subset of the other, and its exclusive flags | 389 | * Memory Nodes are a subset of the other, and its exclusive flags |
| 349 | * are only set if the other's are set. Call holding cgroup_mutex. | 390 | * are only set if the other's are set. Call holding cpuset_mutex. |
| 350 | */ | 391 | */ |
| 351 | 392 | ||
| 352 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 393 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
| @@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
| 395 | * If we replaced the flag and mask values of the current cpuset | 436 | * If we replaced the flag and mask values of the current cpuset |
| 396 | * (cur) with those values in the trial cpuset (trial), would | 437 | * (cur) with those values in the trial cpuset (trial), would |
| 397 | * our various subset and exclusive rules still be valid? Presumes | 438 | * our various subset and exclusive rules still be valid? Presumes |
| 398 | * cgroup_mutex held. | 439 | * cpuset_mutex held. |
| 399 | * | 440 | * |
| 400 | * 'cur' is the address of an actual, in-use cpuset. Operations | 441 | * 'cur' is the address of an actual, in-use cpuset. Operations |
| 401 | * such as list traversal that depend on the actual address of the | 442 | * such as list traversal that depend on the actual address of the |
| @@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 412 | { | 453 | { |
| 413 | struct cgroup *cont; | 454 | struct cgroup *cont; |
| 414 | struct cpuset *c, *par; | 455 | struct cpuset *c, *par; |
| 456 | int ret; | ||
| 457 | |||
| 458 | rcu_read_lock(); | ||
| 415 | 459 | ||
| 416 | /* Each of our child cpusets must be a subset of us */ | 460 | /* Each of our child cpusets must be a subset of us */ |
| 417 | list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { | 461 | ret = -EBUSY; |
| 418 | if (!is_cpuset_subset(cgroup_cs(cont), trial)) | 462 | cpuset_for_each_child(c, cont, cur) |
| 419 | return -EBUSY; | 463 | if (!is_cpuset_subset(c, trial)) |
| 420 | } | 464 | goto out; |
| 421 | 465 | ||
| 422 | /* Remaining checks don't apply to root cpuset */ | 466 | /* Remaining checks don't apply to root cpuset */ |
| 467 | ret = 0; | ||
| 423 | if (cur == &top_cpuset) | 468 | if (cur == &top_cpuset) |
| 424 | return 0; | 469 | goto out; |
| 425 | 470 | ||
| 426 | par = cur->parent; | 471 | par = parent_cs(cur); |
| 427 | 472 | ||
| 428 | /* We must be a subset of our parent cpuset */ | 473 | /* We must be a subset of our parent cpuset */ |
| 474 | ret = -EACCES; | ||
| 429 | if (!is_cpuset_subset(trial, par)) | 475 | if (!is_cpuset_subset(trial, par)) |
| 430 | return -EACCES; | 476 | goto out; |
| 431 | 477 | ||
| 432 | /* | 478 | /* |
| 433 | * If either I or some sibling (!= me) is exclusive, we can't | 479 | * If either I or some sibling (!= me) is exclusive, we can't |
| 434 | * overlap | 480 | * overlap |
| 435 | */ | 481 | */ |
| 436 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { | 482 | ret = -EINVAL; |
| 437 | c = cgroup_cs(cont); | 483 | cpuset_for_each_child(c, cont, par) { |
| 438 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 484 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
| 439 | c != cur && | 485 | c != cur && |
| 440 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 486 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
| 441 | return -EINVAL; | 487 | goto out; |
| 442 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && | 488 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && |
| 443 | c != cur && | 489 | c != cur && |
| 444 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) | 490 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) |
| 445 | return -EINVAL; | 491 | goto out; |
| 446 | } | 492 | } |
| 447 | 493 | ||
| 448 | /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ | 494 | /* |
| 449 | if (cgroup_task_count(cur->css.cgroup)) { | 495 | * Cpusets with tasks - existing or newly being attached - can't |
| 450 | if (cpumask_empty(trial->cpus_allowed) || | 496 | * have empty cpus_allowed or mems_allowed. |
| 451 | nodes_empty(trial->mems_allowed)) { | 497 | */ |
| 452 | return -ENOSPC; | 498 | ret = -ENOSPC; |
| 453 | } | 499 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && |
| 454 | } | 500 | (cpumask_empty(trial->cpus_allowed) || |
| 501 | nodes_empty(trial->mems_allowed))) | ||
| 502 | goto out; | ||
| 455 | 503 | ||
| 456 | return 0; | 504 | ret = 0; |
| 505 | out: | ||
| 506 | rcu_read_unlock(); | ||
| 507 | return ret; | ||
| 457 | } | 508 | } |
| 458 | 509 | ||
| 459 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
| @@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 474 | return; | 525 | return; |
| 475 | } | 526 | } |
| 476 | 527 | ||
| 477 | static void | 528 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
| 478 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | 529 | struct cpuset *root_cs) |
| 479 | { | 530 | { |
| 480 | LIST_HEAD(q); | 531 | struct cpuset *cp; |
| 481 | 532 | struct cgroup *pos_cgrp; | |
| 482 | list_add(&c->stack_list, &q); | ||
| 483 | while (!list_empty(&q)) { | ||
| 484 | struct cpuset *cp; | ||
| 485 | struct cgroup *cont; | ||
| 486 | struct cpuset *child; | ||
| 487 | |||
| 488 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
| 489 | list_del(q.next); | ||
| 490 | 533 | ||
| 491 | if (cpumask_empty(cp->cpus_allowed)) | 534 | rcu_read_lock(); |
| 535 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
| 536 | /* skip the whole subtree if @cp doesn't have any CPU */ | ||
| 537 | if (cpumask_empty(cp->cpus_allowed)) { | ||
| 538 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
| 492 | continue; | 539 | continue; |
| 540 | } | ||
| 493 | 541 | ||
| 494 | if (is_sched_load_balance(cp)) | 542 | if (is_sched_load_balance(cp)) |
| 495 | update_domain_attr(dattr, cp); | 543 | update_domain_attr(dattr, cp); |
| 496 | |||
| 497 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
| 498 | child = cgroup_cs(cont); | ||
| 499 | list_add_tail(&child->stack_list, &q); | ||
| 500 | } | ||
| 501 | } | 544 | } |
| 545 | rcu_read_unlock(); | ||
| 502 | } | 546 | } |
| 503 | 547 | ||
| 504 | /* | 548 | /* |
| @@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 520 | * domains when operating in the severe memory shortage situations | 564 | * domains when operating in the severe memory shortage situations |
| 521 | * that could cause allocation failures below. | 565 | * that could cause allocation failures below. |
| 522 | * | 566 | * |
| 523 | * Must be called with cgroup_lock held. | 567 | * Must be called with cpuset_mutex held. |
| 524 | * | 568 | * |
| 525 | * The three key local variables below are: | 569 | * The three key local variables below are: |
| 526 | * q - a linked-list queue of cpuset pointers, used to implement a | 570 | * q - a linked-list queue of cpuset pointers, used to implement a |
| @@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 558 | static int generate_sched_domains(cpumask_var_t **domains, | 602 | static int generate_sched_domains(cpumask_var_t **domains, |
| 559 | struct sched_domain_attr **attributes) | 603 | struct sched_domain_attr **attributes) |
| 560 | { | 604 | { |
| 561 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | ||
| 562 | struct cpuset *cp; /* scans q */ | 605 | struct cpuset *cp; /* scans q */ |
| 563 | struct cpuset **csa; /* array of all cpuset ptrs */ | 606 | struct cpuset **csa; /* array of all cpuset ptrs */ |
| 564 | int csn; /* how many cpuset ptrs in csa so far */ | 607 | int csn; /* how many cpuset ptrs in csa so far */ |
| @@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 567 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 610 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
| 568 | int ndoms = 0; /* number of sched domains in result */ | 611 | int ndoms = 0; /* number of sched domains in result */ |
| 569 | int nslot; /* next empty doms[] struct cpumask slot */ | 612 | int nslot; /* next empty doms[] struct cpumask slot */ |
| 613 | struct cgroup *pos_cgrp; | ||
| 570 | 614 | ||
| 571 | doms = NULL; | 615 | doms = NULL; |
| 572 | dattr = NULL; | 616 | dattr = NULL; |
| @@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 594 | goto done; | 638 | goto done; |
| 595 | csn = 0; | 639 | csn = 0; |
| 596 | 640 | ||
| 597 | list_add(&top_cpuset.stack_list, &q); | 641 | rcu_read_lock(); |
| 598 | while (!list_empty(&q)) { | 642 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { |
| 599 | struct cgroup *cont; | ||
| 600 | struct cpuset *child; /* scans child cpusets of cp */ | ||
| 601 | |||
| 602 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
| 603 | list_del(q.next); | ||
| 604 | |||
| 605 | if (cpumask_empty(cp->cpus_allowed)) | ||
| 606 | continue; | ||
| 607 | |||
| 608 | /* | 643 | /* |
| 609 | * All child cpusets contain a subset of the parent's cpus, so | 644 | * Continue traversing beyond @cp iff @cp has some CPUs and |
| 610 | * just skip them, and then we call update_domain_attr_tree() | 645 | * isn't load balancing. The former is obvious. The |
| 611 | * to calc relax_domain_level of the corresponding sched | 646 | * latter: All child cpusets contain a subset of the |
| 612 | * domain. | 647 | * parent's cpus, so just skip them, and then we call |
| 648 | * update_domain_attr_tree() to calc relax_domain_level of | ||
| 649 | * the corresponding sched domain. | ||
| 613 | */ | 650 | */ |
| 614 | if (is_sched_load_balance(cp)) { | 651 | if (!cpumask_empty(cp->cpus_allowed) && |
| 615 | csa[csn++] = cp; | 652 | !is_sched_load_balance(cp)) |
| 616 | continue; | 653 | continue; |
| 617 | } | ||
| 618 | 654 | ||
| 619 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 655 | if (is_sched_load_balance(cp)) |
| 620 | child = cgroup_cs(cont); | 656 | csa[csn++] = cp; |
| 621 | list_add_tail(&child->stack_list, &q); | 657 | |
| 622 | } | 658 | /* skip @cp's subtree */ |
| 623 | } | 659 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); |
| 660 | } | ||
| 661 | rcu_read_unlock(); | ||
| 624 | 662 | ||
| 625 | for (i = 0; i < csn; i++) | 663 | for (i = 0; i < csn; i++) |
| 626 | csa[i]->pn = i; | 664 | csa[i]->pn = i; |
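
Both update_domain_attr_tree() and generate_sched_domains() above drop the stack_list queue in favour of the RCU-protected pre-order iterator defined earlier in this patch, pruning uninteresting subtrees with cgroup_rightmost_descendant(). The traversal idiom, condensed from those two hunks:

	struct cpuset *cp;
	struct cgroup *pos_cgrp;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
		if (cpumask_empty(cp->cpus_allowed)) {
			/* nothing below can have CPUs either: skip the subtree */
			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
			continue;
		}
		/* ... visit cp ... */
	}
	rcu_read_unlock();
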
| @@ -725,25 +763,25 @@ done: | |||
| 725 | /* | 763 | /* |
| 726 | * Rebuild scheduler domains. | 764 | * Rebuild scheduler domains. |
| 727 | * | 765 | * |
| 728 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | 766 | * If the flag 'sched_load_balance' of any cpuset with non-empty |
| 729 | * Takes both cgroup_mutex and get_online_cpus(). | 767 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset |
| 768 | * which has that flag enabled, or if any cpuset with a non-empty | ||
| 769 | * 'cpus' is removed, then call this routine to rebuild the | ||
| 770 | * scheduler's dynamic sched domains. | ||
| 730 | * | 771 | * |
| 731 | * Cannot be directly called from cpuset code handling changes | 772 | * Call with cpuset_mutex held. Takes get_online_cpus(). |
| 732 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
| 733 | * from code that already holds cgroup_mutex. | ||
| 734 | */ | 773 | */ |
| 735 | static void do_rebuild_sched_domains(struct work_struct *unused) | 774 | static void rebuild_sched_domains_locked(void) |
| 736 | { | 775 | { |
| 737 | struct sched_domain_attr *attr; | 776 | struct sched_domain_attr *attr; |
| 738 | cpumask_var_t *doms; | 777 | cpumask_var_t *doms; |
| 739 | int ndoms; | 778 | int ndoms; |
| 740 | 779 | ||
| 780 | lockdep_assert_held(&cpuset_mutex); | ||
| 741 | get_online_cpus(); | 781 | get_online_cpus(); |
| 742 | 782 | ||
| 743 | /* Generate domain masks and attrs */ | 783 | /* Generate domain masks and attrs */ |
| 744 | cgroup_lock(); | ||
| 745 | ndoms = generate_sched_domains(&doms, &attr); | 784 | ndoms = generate_sched_domains(&doms, &attr); |
| 746 | cgroup_unlock(); | ||
| 747 | 785 | ||
| 748 | /* Have scheduler rebuild the domains */ | 786 | /* Have scheduler rebuild the domains */ |
| 749 | partition_sched_domains(ndoms, doms, attr); | 787 | partition_sched_domains(ndoms, doms, attr); |
| @@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) | |||
| 751 | put_online_cpus(); | 789 | put_online_cpus(); |
| 752 | } | 790 | } |
| 753 | #else /* !CONFIG_SMP */ | 791 | #else /* !CONFIG_SMP */ |
| 754 | static void do_rebuild_sched_domains(struct work_struct *unused) | 792 | static void rebuild_sched_domains_locked(void) |
| 755 | { | 793 | { |
| 756 | } | 794 | } |
| 757 | 795 | ||
| @@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 763 | } | 801 | } |
| 764 | #endif /* CONFIG_SMP */ | 802 | #endif /* CONFIG_SMP */ |
| 765 | 803 | ||
| 766 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | ||
| 767 | |||
| 768 | /* | ||
| 769 | * Rebuild scheduler domains, asynchronously via workqueue. | ||
| 770 | * | ||
| 771 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
| 772 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
| 773 | * which has that flag enabled, or if any cpuset with a non-empty | ||
| 774 | * 'cpus' is removed, then call this routine to rebuild the | ||
| 775 | * scheduler's dynamic sched domains. | ||
| 776 | * | ||
| 777 | * The rebuild_sched_domains() and partition_sched_domains() | ||
| 778 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
| 779 | * but such cpuset changes as these must nest that locking the | ||
| 780 | * other way, holding cgroup_lock() for much of the code. | ||
| 781 | * | ||
| 782 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
| 783 | * these user changes delegates the actual sched domain rebuilding | ||
| 784 | * to a separate workqueue thread, which ends up processing the | ||
| 785 | * above do_rebuild_sched_domains() function. | ||
| 786 | */ | ||
| 787 | static void async_rebuild_sched_domains(void) | ||
| 788 | { | ||
| 789 | queue_work(cpuset_wq, &rebuild_sched_domains_work); | ||
| 790 | } | ||
| 791 | |||
| 792 | /* | ||
| 793 | * Accomplishes the same scheduler domain rebuild as the above | ||
| 794 | * async_rebuild_sched_domains(), however it directly calls the | ||
| 795 | * rebuild routine synchronously rather than calling it via an | ||
| 796 | * asynchronous work thread. | ||
| 797 | * | ||
| 798 | * This can only be called from code that is not holding | ||
| 799 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
| 800 | */ | ||
| 801 | void rebuild_sched_domains(void) | 804 | void rebuild_sched_domains(void) |
| 802 | { | 805 | { |
| 803 | do_rebuild_sched_domains(NULL); | 806 | mutex_lock(&cpuset_mutex); |
| 807 | rebuild_sched_domains_locked(); | ||
| 808 | mutex_unlock(&cpuset_mutex); | ||
| 804 | } | 809 | } |
| 805 | 810 | ||
| 806 | /** | 811 | /** |
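The hunk above splits the rebuild path in two: rebuild_sched_domains_locked() runs with cpuset_mutex already held (checked via lockdep_assert_held()), while the exported rebuild_sched_domains() is now just a thin wrapper that takes and drops the mutex around it. Below is a minimal userspace sketch of that "_locked worker plus locking wrapper" convention, using a pthread mutex; the names are placeholders rather than kernel symbols.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_mutex = PTHREAD_MUTEX_INITIALIZER;
static int cfg_generation;

/* Callers must already hold cfg_mutex; the kernel version enforces this
 * with lockdep_assert_held(&cpuset_mutex). */
static void rebuild_config_locked(void)
{
    cfg_generation++;                   /* stands in for the real rebuild work */
    printf("rebuilt, generation %d\n", cfg_generation);
}

/* Public entry point for callers that do not hold the lock. */
void rebuild_config(void)
{
    pthread_mutex_lock(&cfg_mutex);
    rebuild_config_locked();
    pthread_mutex_unlock(&cfg_mutex);
}

int main(void)
{
    rebuild_config();
    return 0;
}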
| @@ -808,7 +813,7 @@ void rebuild_sched_domains(void) | |||
| 808 | * @tsk: task to test | 813 | * @tsk: task to test |
| 809 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | 814 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner |
| 810 | * | 815 | * |
| 811 | * Call with cgroup_mutex held. May take callback_mutex during call. | 816 | * Call with cpuset_mutex held. May take callback_mutex during call. |
| 812 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 817 | * Called for each task in a cgroup by cgroup_scan_tasks(). |
| 813 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other | 818 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other |
| 814 | * words, if its mask is not equal to its cpuset's mask). | 819 | * words, if its mask is not equal to its cpuset's mask). |
| @@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
| 829 | * cpus_allowed mask needs to be changed. | 834 | * cpus_allowed mask needs to be changed. |
| 830 | * | 835 | * |
| 831 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 836 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
| 832 | * holding cgroup_lock() at this point. | 837 | * holding cpuset_mutex at this point. |
| 833 | */ | 838 | */ |
| 834 | static void cpuset_change_cpumask(struct task_struct *tsk, | 839 | static void cpuset_change_cpumask(struct task_struct *tsk, |
| 835 | struct cgroup_scanner *scan) | 840 | struct cgroup_scanner *scan) |
| @@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
| 842 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 847 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
| 843 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 848 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
| 844 | * | 849 | * |
| 845 | * Called with cgroup_mutex held | 850 | * Called with cpuset_mutex held |
| 846 | * | 851 | * |
| 847 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 852 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
| 848 | * calling callback functions for each. | 853 | * calling callback functions for each. |
| @@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 920 | heap_free(&heap); | 925 | heap_free(&heap); |
| 921 | 926 | ||
| 922 | if (is_load_balanced) | 927 | if (is_load_balanced) |
| 923 | async_rebuild_sched_domains(); | 928 | rebuild_sched_domains_locked(); |
| 924 | return 0; | 929 | return 0; |
| 925 | } | 930 | } |
| 926 | 931 | ||
| @@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 932 | * Temporarilly set tasks mems_allowed to target nodes of migration, | 937 | * Temporarilly set tasks mems_allowed to target nodes of migration, |
| 933 | * so that the migration code can allocate pages on these nodes. | 938 | * so that the migration code can allocate pages on these nodes. |
| 934 | * | 939 | * |
| 935 | * Call holding cgroup_mutex, so current's cpuset won't change | 940 | * Call holding cpuset_mutex, so current's cpuset won't change |
| 936 | * during this call, as manage_mutex holds off any cpuset_attach() | 941 | * during this call, as manage_mutex holds off any cpuset_attach() |
| 937 | * calls. Therefore we don't need to take task_lock around the | 942 | * calls. Therefore we don't need to take task_lock around the |
| 938 | * call to guarantee_online_mems(), as we know no one is changing | 943 | * call to guarantee_online_mems(), as we know no one is changing |
| @@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
| 1007 | /* | 1012 | /* |
| 1008 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1013 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
| 1009 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1014 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
| 1010 | * memory_migrate flag is set. Called with cgroup_mutex held. | 1015 | * memory_migrate flag is set. Called with cpuset_mutex held. |
| 1011 | */ | 1016 | */ |
| 1012 | static void cpuset_change_nodemask(struct task_struct *p, | 1017 | static void cpuset_change_nodemask(struct task_struct *p, |
| 1013 | struct cgroup_scanner *scan) | 1018 | struct cgroup_scanner *scan) |
| @@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
| 1016 | struct cpuset *cs; | 1021 | struct cpuset *cs; |
| 1017 | int migrate; | 1022 | int migrate; |
| 1018 | const nodemask_t *oldmem = scan->data; | 1023 | const nodemask_t *oldmem = scan->data; |
| 1019 | static nodemask_t newmems; /* protected by cgroup_mutex */ | 1024 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
| 1020 | 1025 | ||
| 1021 | cs = cgroup_cs(scan->cg); | 1026 | cs = cgroup_cs(scan->cg); |
| 1022 | guarantee_online_mems(cs, &newmems); | 1027 | guarantee_online_mems(cs, &newmems); |
| @@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; | |||
| 1043 | * @oldmem: old mems_allowed of cpuset cs | 1048 | * @oldmem: old mems_allowed of cpuset cs |
| 1044 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1049 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
| 1045 | * | 1050 | * |
| 1046 | * Called with cgroup_mutex held | 1051 | * Called with cpuset_mutex held |
| 1047 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1052 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
| 1048 | * if @heap != NULL. | 1053 | * if @heap != NULL. |
| 1049 | */ | 1054 | */ |
| @@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
| 1065 | * take while holding tasklist_lock. Forks can happen - the | 1070 | * take while holding tasklist_lock. Forks can happen - the |
| 1066 | * mpol_dup() cpuset_being_rebound check will catch such forks, | 1071 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
| 1067 | * and rebind their vma mempolicies too. Because we still hold | 1072 | * and rebind their vma mempolicies too. Because we still hold |
| 1068 | * the global cgroup_mutex, we know that no other rebind effort | 1073 | * the global cpuset_mutex, we know that no other rebind effort |
| 1069 | * will be contending for the global variable cpuset_being_rebound. | 1074 | * will be contending for the global variable cpuset_being_rebound. |
| 1070 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1075 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
| 1071 | * is idempotent. Also migrate pages in each mm to new nodes. | 1076 | * is idempotent. Also migrate pages in each mm to new nodes. |
| @@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
| 1084 | * mempolicies and if the cpuset is marked 'memory_migrate', | 1089 | * mempolicies and if the cpuset is marked 'memory_migrate', |
| 1085 | * migrate the tasks pages to the new memory. | 1090 | * migrate the tasks pages to the new memory. |
| 1086 | * | 1091 | * |
| 1087 | * Call with cgroup_mutex held. May take callback_mutex during call. | 1092 | * Call with cpuset_mutex held. May take callback_mutex during call. |
| 1088 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1093 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
| 1089 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1094 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
| 1090 | * their mempolicies to the cpusets new mems_allowed. | 1095 | * their mempolicies to the cpusets new mems_allowed. |
| @@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
| 1168 | cs->relax_domain_level = val; | 1173 | cs->relax_domain_level = val; |
| 1169 | if (!cpumask_empty(cs->cpus_allowed) && | 1174 | if (!cpumask_empty(cs->cpus_allowed) && |
| 1170 | is_sched_load_balance(cs)) | 1175 | is_sched_load_balance(cs)) |
| 1171 | async_rebuild_sched_domains(); | 1176 | rebuild_sched_domains_locked(); |
| 1172 | } | 1177 | } |
| 1173 | 1178 | ||
| 1174 | return 0; | 1179 | return 0; |
| @@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
| 1182 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1187 | * Called by cgroup_scan_tasks() for each task in a cgroup. |
| 1183 | * | 1188 | * |
| 1184 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1189 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
| 1185 | * holding cgroup_lock() at this point. | 1190 | * holding cpuset_mutex at this point. |
| 1186 | */ | 1191 | */ |
| 1187 | static void cpuset_change_flag(struct task_struct *tsk, | 1192 | static void cpuset_change_flag(struct task_struct *tsk, |
| 1188 | struct cgroup_scanner *scan) | 1193 | struct cgroup_scanner *scan) |
| @@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, | |||
| 1195 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1200 | * @cs: the cpuset in which each task's spread flags needs to be changed |
| 1196 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1201 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
| 1197 | * | 1202 | * |
| 1198 | * Called with cgroup_mutex held | 1203 | * Called with cpuset_mutex held |
| 1199 | * | 1204 | * |
| 1200 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1205 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
| 1201 | * calling callback functions for each. | 1206 | * calling callback functions for each. |
| @@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | |||
| 1220 | * cs: the cpuset to update | 1225 | * cs: the cpuset to update |
| 1221 | * turning_on: whether the flag is being set or cleared | 1226 | * turning_on: whether the flag is being set or cleared |
| 1222 | * | 1227 | * |
| 1223 | * Call with cgroup_mutex held. | 1228 | * Call with cpuset_mutex held. |
| 1224 | */ | 1229 | */ |
| 1225 | 1230 | ||
| 1226 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | 1231 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
| @@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
| 1260 | mutex_unlock(&callback_mutex); | 1265 | mutex_unlock(&callback_mutex); |
| 1261 | 1266 | ||
| 1262 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1267 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
| 1263 | async_rebuild_sched_domains(); | 1268 | rebuild_sched_domains_locked(); |
| 1264 | 1269 | ||
| 1265 | if (spread_flag_changed) | 1270 | if (spread_flag_changed) |
| 1266 | update_tasks_flags(cs, &heap); | 1271 | update_tasks_flags(cs, &heap); |
| @@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
| 1368 | return val; | 1373 | return val; |
| 1369 | } | 1374 | } |
| 1370 | 1375 | ||
| 1371 | /* | 1376 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
| 1372 | * Protected by cgroup_lock. The nodemasks must be stored globally because | ||
| 1373 | * dynamically allocating them is not allowed in can_attach, and they must | ||
| 1374 | * persist until attach. | ||
| 1375 | */ | ||
| 1376 | static cpumask_var_t cpus_attach; | ||
| 1377 | static nodemask_t cpuset_attach_nodemask_from; | ||
| 1378 | static nodemask_t cpuset_attach_nodemask_to; | ||
| 1379 | |||
| 1380 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
| 1381 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1377 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
| 1382 | { | 1378 | { |
| 1383 | struct cpuset *cs = cgroup_cs(cgrp); | 1379 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1384 | struct task_struct *task; | 1380 | struct task_struct *task; |
| 1385 | int ret; | 1381 | int ret; |
| 1386 | 1382 | ||
| 1383 | mutex_lock(&cpuset_mutex); | ||
| 1384 | |||
| 1385 | ret = -ENOSPC; | ||
| 1387 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1386 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
| 1388 | return -ENOSPC; | 1387 | goto out_unlock; |
| 1389 | 1388 | ||
| 1390 | cgroup_taskset_for_each(task, cgrp, tset) { | 1389 | cgroup_taskset_for_each(task, cgrp, tset) { |
| 1391 | /* | 1390 | /* |
| @@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1397 | * set_cpus_allowed_ptr() on all attached tasks before | 1396 | * set_cpus_allowed_ptr() on all attached tasks before |
| 1398 | * cpus_allowed may be changed. | 1397 | * cpus_allowed may be changed. |
| 1399 | */ | 1398 | */ |
| 1399 | ret = -EINVAL; | ||
| 1400 | if (task->flags & PF_THREAD_BOUND) | 1400 | if (task->flags & PF_THREAD_BOUND) |
| 1401 | return -EINVAL; | 1401 | goto out_unlock; |
| 1402 | if ((ret = security_task_setscheduler(task))) | 1402 | ret = security_task_setscheduler(task); |
| 1403 | return ret; | 1403 | if (ret) |
| 1404 | goto out_unlock; | ||
| 1404 | } | 1405 | } |
| 1405 | 1406 | ||
| 1406 | /* prepare for attach */ | 1407 | /* |
| 1407 | if (cs == &top_cpuset) | 1408 | * Mark attach is in progress. This makes validate_change() fail |
| 1408 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1409 | * changes which zero cpus/mems_allowed. |
| 1409 | else | 1410 | */ |
| 1410 | guarantee_online_cpus(cs, cpus_attach); | 1411 | cs->attach_in_progress++; |
| 1411 | 1412 | ret = 0; | |
| 1412 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1413 | out_unlock: |
| 1414 | mutex_unlock(&cpuset_mutex); | ||
| 1415 | return ret; | ||
| 1416 | } | ||
| 1413 | 1417 | ||
| 1414 | return 0; | 1418 | static void cpuset_cancel_attach(struct cgroup *cgrp, |
| 1419 | struct cgroup_taskset *tset) | ||
| 1420 | { | ||
| 1421 | mutex_lock(&cpuset_mutex); | ||
| 1422 | cgroup_cs(cgrp)->attach_in_progress--; | ||
| 1423 | mutex_unlock(&cpuset_mutex); | ||
| 1415 | } | 1424 | } |
| 1416 | 1425 | ||
| 1426 | /* | ||
| 1427 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() | ||
| 1428 | * but we can't allocate it dynamically there. Define it global and | ||
| 1429 | * allocate from cpuset_init(). | ||
| 1430 | */ | ||
| 1431 | static cpumask_var_t cpus_attach; | ||
| 1432 | |||
| 1417 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1433 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
| 1418 | { | 1434 | { |
| 1435 | /* static bufs protected by cpuset_mutex */ | ||
| 1436 | static nodemask_t cpuset_attach_nodemask_from; | ||
| 1437 | static nodemask_t cpuset_attach_nodemask_to; | ||
| 1419 | struct mm_struct *mm; | 1438 | struct mm_struct *mm; |
| 1420 | struct task_struct *task; | 1439 | struct task_struct *task; |
| 1421 | struct task_struct *leader = cgroup_taskset_first(tset); | 1440 | struct task_struct *leader = cgroup_taskset_first(tset); |
| @@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1423 | struct cpuset *cs = cgroup_cs(cgrp); | 1442 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1424 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1443 | struct cpuset *oldcs = cgroup_cs(oldcgrp); |
| 1425 | 1444 | ||
| 1445 | mutex_lock(&cpuset_mutex); | ||
| 1446 | |||
| 1447 | /* prepare for attach */ | ||
| 1448 | if (cs == &top_cpuset) | ||
| 1449 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
| 1450 | else | ||
| 1451 | guarantee_online_cpus(cs, cpus_attach); | ||
| 1452 | |||
| 1453 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | ||
| 1454 | |||
| 1426 | cgroup_taskset_for_each(task, cgrp, tset) { | 1455 | cgroup_taskset_for_each(task, cgrp, tset) { |
| 1427 | /* | 1456 | /* |
| 1428 | * can_attach beforehand should guarantee that this doesn't | 1457 | * can_attach beforehand should guarantee that this doesn't |
| @@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1448 | &cpuset_attach_nodemask_to); | 1477 | &cpuset_attach_nodemask_to); |
| 1449 | mmput(mm); | 1478 | mmput(mm); |
| 1450 | } | 1479 | } |
| 1480 | |||
| 1481 | cs->attach_in_progress--; | ||
| 1482 | |||
| 1483 | /* | ||
| 1484 | * We may have raced with CPU/memory hotunplug. Trigger hotplug | ||
| 1485 | * propagation if @cs doesn't have any CPU or memory. It will move | ||
| 1486 | * the newly added tasks to the nearest parent which can execute. | ||
| 1487 | */ | ||
| 1488 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
| 1489 | schedule_cpuset_propagate_hotplug(cs); | ||
| 1490 | |||
| 1491 | mutex_unlock(&cpuset_mutex); | ||
| 1451 | } | 1492 | } |
| 1452 | 1493 | ||
| 1453 | /* The various types of files and directories in a cpuset file system */ | 1494 | /* The various types of files and directories in a cpuset file system */ |
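With cgroup_mutex no longer protecting cpuset internals, cpuset_can_attach() now takes cpuset_mutex itself, rejects the attach early if the cpuset has no CPUs or memory, and bumps cs->attach_in_progress so that validate_change() refuses configuration changes that would empty the cpuset while the attach is in flight; cpuset_cancel_attach() and cpuset_attach() drop the counter again, and the attach path re-checks for emptiness at the end in case it raced with hotunplug. A hedged userspace analogue of that prepare/cancel/commit protocol is sketched below; the struct, field and function names are invented for illustration.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct group {
    pthread_mutex_t lock;
    int ncpus;              /* stand-in for cpus_allowed */
    int attach_in_progress; /* attaches between prepare and commit/cancel */
};

static struct group grp = { .lock = PTHREAD_MUTEX_INITIALIZER, .ncpus = 4 };

int group_attach_prepare(struct group *g)
{
    int ret = 0;

    pthread_mutex_lock(&g->lock);
    if (g->ncpus == 0)
        ret = -ENOSPC;              /* nothing to run on, refuse */
    else
        g->attach_in_progress++;    /* validation elsewhere sees this */
    pthread_mutex_unlock(&g->lock);
    return ret;
}

void group_attach_cancel(struct group *g)
{
    pthread_mutex_lock(&g->lock);
    g->attach_in_progress--;
    pthread_mutex_unlock(&g->lock);
}

void group_attach_commit(struct group *g)
{
    pthread_mutex_lock(&g->lock);
    /* ... actually move the task here ... */
    g->attach_in_progress--;
    pthread_mutex_unlock(&g->lock);
}

int main(void)
{
    if (group_attach_prepare(&grp) == 0)
        group_attach_commit(&grp);
    printf("attaches in progress: %d\n", grp.attach_in_progress);
    return 0;
}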
| @@ -1469,12 +1510,13 @@ typedef enum { | |||
| 1469 | 1510 | ||
| 1470 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1511 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
| 1471 | { | 1512 | { |
| 1472 | int retval = 0; | ||
| 1473 | struct cpuset *cs = cgroup_cs(cgrp); | 1513 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1474 | cpuset_filetype_t type = cft->private; | 1514 | cpuset_filetype_t type = cft->private; |
| 1515 | int retval = -ENODEV; | ||
| 1475 | 1516 | ||
| 1476 | if (!cgroup_lock_live_group(cgrp)) | 1517 | mutex_lock(&cpuset_mutex); |
| 1477 | return -ENODEV; | 1518 | if (!is_cpuset_online(cs)) |
| 1519 | goto out_unlock; | ||
| 1478 | 1520 | ||
| 1479 | switch (type) { | 1521 | switch (type) { |
| 1480 | case FILE_CPU_EXCLUSIVE: | 1522 | case FILE_CPU_EXCLUSIVE: |
| @@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
| 1508 | retval = -EINVAL; | 1550 | retval = -EINVAL; |
| 1509 | break; | 1551 | break; |
| 1510 | } | 1552 | } |
| 1511 | cgroup_unlock(); | 1553 | out_unlock: |
| 1554 | mutex_unlock(&cpuset_mutex); | ||
| 1512 | return retval; | 1555 | return retval; |
| 1513 | } | 1556 | } |
| 1514 | 1557 | ||
| 1515 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1558 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) |
| 1516 | { | 1559 | { |
| 1517 | int retval = 0; | ||
| 1518 | struct cpuset *cs = cgroup_cs(cgrp); | 1560 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1519 | cpuset_filetype_t type = cft->private; | 1561 | cpuset_filetype_t type = cft->private; |
| 1562 | int retval = -ENODEV; | ||
| 1520 | 1563 | ||
| 1521 | if (!cgroup_lock_live_group(cgrp)) | 1564 | mutex_lock(&cpuset_mutex); |
| 1522 | return -ENODEV; | 1565 | if (!is_cpuset_online(cs)) |
| 1566 | goto out_unlock; | ||
| 1523 | 1567 | ||
| 1524 | switch (type) { | 1568 | switch (type) { |
| 1525 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1569 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
| @@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
| 1529 | retval = -EINVAL; | 1573 | retval = -EINVAL; |
| 1530 | break; | 1574 | break; |
| 1531 | } | 1575 | } |
| 1532 | cgroup_unlock(); | 1576 | out_unlock: |
| 1577 | mutex_unlock(&cpuset_mutex); | ||
| 1533 | return retval; | 1578 | return retval; |
| 1534 | } | 1579 | } |
| 1535 | 1580 | ||
| @@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
| 1539 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1584 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, |
| 1540 | const char *buf) | 1585 | const char *buf) |
| 1541 | { | 1586 | { |
| 1542 | int retval = 0; | ||
| 1543 | struct cpuset *cs = cgroup_cs(cgrp); | 1587 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1544 | struct cpuset *trialcs; | 1588 | struct cpuset *trialcs; |
| 1589 | int retval = -ENODEV; | ||
| 1590 | |||
| 1591 | /* | ||
| 1592 | * CPU or memory hotunplug may leave @cs w/o any execution | ||
| 1593 | * resources, in which case the hotplug code asynchronously updates | ||
| 1594 | * configuration and transfers all tasks to the nearest ancestor | ||
| 1595 | * which can execute. | ||
| 1596 | * | ||
| 1597 | * As writes to "cpus" or "mems" may restore @cs's execution | ||
| 1598 | * resources, wait for the previously scheduled operations before | ||
| 1599 | * proceeding, so that we don't end up repeatedly removing tasks added | ||

| 1600 | * after execution capability is restored. | ||
| 1601 | * | ||
| 1602 | * Flushing cpuset_hotplug_work is enough to synchronize against | ||
| 1603 | * hotplug handling; however, cpuset_attach() may schedule | ||
| 1604 | * propagation work directly. Flush the workqueue too. | ||
| 1605 | */ | ||
| 1606 | flush_work(&cpuset_hotplug_work); | ||
| 1607 | flush_workqueue(cpuset_propagate_hotplug_wq); | ||
| 1545 | 1608 | ||
| 1546 | if (!cgroup_lock_live_group(cgrp)) | 1609 | mutex_lock(&cpuset_mutex); |
| 1547 | return -ENODEV; | 1610 | if (!is_cpuset_online(cs)) |
| 1611 | goto out_unlock; | ||
| 1548 | 1612 | ||
| 1549 | trialcs = alloc_trial_cpuset(cs); | 1613 | trialcs = alloc_trial_cpuset(cs); |
| 1550 | if (!trialcs) { | 1614 | if (!trialcs) { |
| 1551 | retval = -ENOMEM; | 1615 | retval = -ENOMEM; |
| 1552 | goto out; | 1616 | goto out_unlock; |
| 1553 | } | 1617 | } |
| 1554 | 1618 | ||
| 1555 | switch (cft->private) { | 1619 | switch (cft->private) { |
| @@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
| 1565 | } | 1629 | } |
| 1566 | 1630 | ||
| 1567 | free_trial_cpuset(trialcs); | 1631 | free_trial_cpuset(trialcs); |
| 1568 | out: | 1632 | out_unlock: |
| 1569 | cgroup_unlock(); | 1633 | mutex_unlock(&cpuset_mutex); |
| 1570 | return retval; | 1634 | return retval; |
| 1571 | } | 1635 | } |
| 1572 | 1636 | ||
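The three write handlers above now share one shape: the return value starts out as -ENODEV, cpuset_mutex is taken unconditionally, an already-offline cpuset bails out through a single out_unlock label, and only then does the code dispatch on the file type. A compact userspace sketch of that single-exit error-path shape follows; all names in it are placeholders.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct object {
    pthread_mutex_t lock;
    bool online;
    long value;
};

int object_write(struct object *obj, int what, long val)
{
    int ret = -ENODEV;              /* default: object already gone */

    pthread_mutex_lock(&obj->lock);
    if (!obj->online)
        goto out_unlock;

    switch (what) {
    case 0:
        obj->value = val;
        ret = 0;
        break;
    default:
        ret = -EINVAL;
        break;
    }
out_unlock:
    pthread_mutex_unlock(&obj->lock);
    return ret;
}

int main(void)
{
    static struct object obj = {
        .lock = PTHREAD_MUTEX_INITIALIZER, .online = true,
    };

    return object_write(&obj, 0, 42) ? 1 : 0;
}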
| @@ -1790,15 +1854,12 @@ static struct cftype files[] = { | |||
| 1790 | 1854 | ||
| 1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | 1855 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
| 1792 | { | 1856 | { |
| 1793 | struct cgroup *parent_cg = cont->parent; | 1857 | struct cpuset *cs; |
| 1794 | struct cgroup *tmp_cg; | ||
| 1795 | struct cpuset *parent, *cs; | ||
| 1796 | 1858 | ||
| 1797 | if (!parent_cg) | 1859 | if (!cont->parent) |
| 1798 | return &top_cpuset.css; | 1860 | return &top_cpuset.css; |
| 1799 | parent = cgroup_cs(parent_cg); | ||
| 1800 | 1861 | ||
| 1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1862 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
| 1802 | if (!cs) | 1863 | if (!cs) |
| 1803 | return ERR_PTR(-ENOMEM); | 1864 | return ERR_PTR(-ENOMEM); |
| 1804 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { | 1865 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { |
| @@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
| 1806 | return ERR_PTR(-ENOMEM); | 1867 | return ERR_PTR(-ENOMEM); |
| 1807 | } | 1868 | } |
| 1808 | 1869 | ||
| 1809 | cs->flags = 0; | ||
| 1810 | if (is_spread_page(parent)) | ||
| 1811 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
| 1812 | if (is_spread_slab(parent)) | ||
| 1813 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
| 1814 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1870 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
| 1815 | cpumask_clear(cs->cpus_allowed); | 1871 | cpumask_clear(cs->cpus_allowed); |
| 1816 | nodes_clear(cs->mems_allowed); | 1872 | nodes_clear(cs->mems_allowed); |
| 1817 | fmeter_init(&cs->fmeter); | 1873 | fmeter_init(&cs->fmeter); |
| 1874 | INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); | ||
| 1818 | cs->relax_domain_level = -1; | 1875 | cs->relax_domain_level = -1; |
| 1819 | 1876 | ||
| 1820 | cs->parent = parent; | 1877 | return &cs->css; |
| 1878 | } | ||
| 1879 | |||
| 1880 | static int cpuset_css_online(struct cgroup *cgrp) | ||
| 1881 | { | ||
| 1882 | struct cpuset *cs = cgroup_cs(cgrp); | ||
| 1883 | struct cpuset *parent = parent_cs(cs); | ||
| 1884 | struct cpuset *tmp_cs; | ||
| 1885 | struct cgroup *pos_cg; | ||
| 1886 | |||
| 1887 | if (!parent) | ||
| 1888 | return 0; | ||
| 1889 | |||
| 1890 | mutex_lock(&cpuset_mutex); | ||
| 1891 | |||
| 1892 | set_bit(CS_ONLINE, &cs->flags); | ||
| 1893 | if (is_spread_page(parent)) | ||
| 1894 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
| 1895 | if (is_spread_slab(parent)) | ||
| 1896 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
| 1897 | |||
| 1821 | number_of_cpusets++; | 1898 | number_of_cpusets++; |
| 1822 | 1899 | ||
| 1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | 1900 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) |
| 1824 | goto skip_clone; | 1901 | goto out_unlock; |
| 1825 | 1902 | ||
| 1826 | /* | 1903 | /* |
| 1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | 1904 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is |
| @@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
| 1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 1913 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
| 1837 | * (and likewise for mems) to the new cgroup. | 1914 | * (and likewise for mems) to the new cgroup. |
| 1838 | */ | 1915 | */ |
| 1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | 1916 | rcu_read_lock(); |
| 1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | 1917 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { |
| 1841 | 1918 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | |
| 1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | 1919 | rcu_read_unlock(); |
| 1843 | goto skip_clone; | 1920 | goto out_unlock; |
| 1921 | } | ||
| 1844 | } | 1922 | } |
| 1923 | rcu_read_unlock(); | ||
| 1845 | 1924 | ||
| 1846 | mutex_lock(&callback_mutex); | 1925 | mutex_lock(&callback_mutex); |
| 1847 | cs->mems_allowed = parent->mems_allowed; | 1926 | cs->mems_allowed = parent->mems_allowed; |
| 1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | 1927 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
| 1849 | mutex_unlock(&callback_mutex); | 1928 | mutex_unlock(&callback_mutex); |
| 1850 | skip_clone: | 1929 | out_unlock: |
| 1851 | return &cs->css; | 1930 | mutex_unlock(&cpuset_mutex); |
| 1931 | return 0; | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | static void cpuset_css_offline(struct cgroup *cgrp) | ||
| 1935 | { | ||
| 1936 | struct cpuset *cs = cgroup_cs(cgrp); | ||
| 1937 | |||
| 1938 | mutex_lock(&cpuset_mutex); | ||
| 1939 | |||
| 1940 | if (is_sched_load_balance(cs)) | ||
| 1941 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
| 1942 | |||
| 1943 | number_of_cpusets--; | ||
| 1944 | clear_bit(CS_ONLINE, &cs->flags); | ||
| 1945 | |||
| 1946 | mutex_unlock(&cpuset_mutex); | ||
| 1852 | } | 1947 | } |
| 1853 | 1948 | ||
| 1854 | /* | 1949 | /* |
| 1855 | * If the cpuset being removed has its flag 'sched_load_balance' | 1950 | * If the cpuset being removed has its flag 'sched_load_balance' |
| 1856 | * enabled, then simulate turning sched_load_balance off, which | 1951 | * enabled, then simulate turning sched_load_balance off, which |
| 1857 | * will call async_rebuild_sched_domains(). | 1952 | * will call rebuild_sched_domains_locked(). |
| 1858 | */ | 1953 | */ |
| 1859 | 1954 | ||
| 1860 | static void cpuset_css_free(struct cgroup *cont) | 1955 | static void cpuset_css_free(struct cgroup *cont) |
| 1861 | { | 1956 | { |
| 1862 | struct cpuset *cs = cgroup_cs(cont); | 1957 | struct cpuset *cs = cgroup_cs(cont); |
| 1863 | 1958 | ||
| 1864 | if (is_sched_load_balance(cs)) | ||
| 1865 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
| 1866 | |||
| 1867 | number_of_cpusets--; | ||
| 1868 | free_cpumask_var(cs->cpus_allowed); | 1959 | free_cpumask_var(cs->cpus_allowed); |
| 1869 | kfree(cs); | 1960 | kfree(cs); |
| 1870 | } | 1961 | } |
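Allocation and teardown are now split across four callbacks: cpuset_css_alloc() only builds the zeroed object, cpuset_css_online() inherits the parent's spread flags, checks exclusive siblings and bumps number_of_cpusets under cpuset_mutex, cpuset_css_offline() undoes that bookkeeping (including turning sched_load_balance off), and cpuset_css_free() is left as pure memory release. The sketch below mirrors that alloc/online/offline/free lifecycle in plain userspace C; the names and the subsystem lock are stand-ins, not kernel symbols.

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t subsys_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_objects;

struct object {
    bool online;
};

struct object *object_alloc(void)
{
    return calloc(1, sizeof(struct object));    /* no locks, no globals */
}

void object_online(struct object *obj)
{
    pthread_mutex_lock(&subsys_lock);
    obj->online = true;
    nr_objects++;                               /* global bookkeeping */
    pthread_mutex_unlock(&subsys_lock);
}

void object_offline(struct object *obj)
{
    pthread_mutex_lock(&subsys_lock);
    obj->online = false;
    nr_objects--;
    pthread_mutex_unlock(&subsys_lock);
}

void object_free(struct object *obj)
{
    free(obj);                                  /* memory only */
}

int main(void)
{
    struct object *obj = object_alloc();

    if (!obj)
        return 1;
    object_online(obj);
    object_offline(obj);
    object_free(obj);
    return 0;
}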
| @@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) | |||
| 1872 | struct cgroup_subsys cpuset_subsys = { | 1963 | struct cgroup_subsys cpuset_subsys = { |
| 1873 | .name = "cpuset", | 1964 | .name = "cpuset", |
| 1874 | .css_alloc = cpuset_css_alloc, | 1965 | .css_alloc = cpuset_css_alloc, |
| 1966 | .css_online = cpuset_css_online, | ||
| 1967 | .css_offline = cpuset_css_offline, | ||
| 1875 | .css_free = cpuset_css_free, | 1968 | .css_free = cpuset_css_free, |
| 1876 | .can_attach = cpuset_can_attach, | 1969 | .can_attach = cpuset_can_attach, |
| 1970 | .cancel_attach = cpuset_cancel_attach, | ||
| 1877 | .attach = cpuset_attach, | 1971 | .attach = cpuset_attach, |
| 1878 | .subsys_id = cpuset_subsys_id, | 1972 | .subsys_id = cpuset_subsys_id, |
| 1879 | .base_cftypes = files, | 1973 | .base_cftypes = files, |
| @@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
| 1924 | { | 2018 | { |
| 1925 | struct cgroup *new_cgroup = scan->data; | 2019 | struct cgroup *new_cgroup = scan->data; |
| 1926 | 2020 | ||
| 2021 | cgroup_lock(); | ||
| 1927 | cgroup_attach_task(new_cgroup, tsk); | 2022 | cgroup_attach_task(new_cgroup, tsk); |
| 2023 | cgroup_unlock(); | ||
| 1928 | } | 2024 | } |
| 1929 | 2025 | ||
| 1930 | /** | 2026 | /** |
| @@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
| 1932 | * @from: cpuset in which the tasks currently reside | 2028 | * @from: cpuset in which the tasks currently reside |
| 1933 | * @to: cpuset to which the tasks will be moved | 2029 | * @to: cpuset to which the tasks will be moved |
| 1934 | * | 2030 | * |
| 1935 | * Called with cgroup_mutex held | 2031 | * Called with cpuset_mutex held |
| 1936 | * callback_mutex must not be held, as cpuset_attach() will take it. | 2032 | * callback_mutex must not be held, as cpuset_attach() will take it. |
| 1937 | * | 2033 | * |
| 1938 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 2034 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
| @@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
| 1959 | * removing that CPU or node from all cpusets. If this removes the | 2055 | * removing that CPU or node from all cpusets. If this removes the |
| 1960 | * last CPU or node from a cpuset, then move the tasks in the empty | 2056 | * last CPU or node from a cpuset, then move the tasks in the empty |
| 1961 | * cpuset to its next-highest non-empty parent. | 2057 | * cpuset to its next-highest non-empty parent. |
| 1962 | * | ||
| 1963 | * Called with cgroup_mutex held | ||
| 1964 | * callback_mutex must not be held, as cpuset_attach() will take it. | ||
| 1965 | */ | 2058 | */ |
| 1966 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | 2059 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
| 1967 | { | 2060 | { |
| 1968 | struct cpuset *parent; | 2061 | struct cpuset *parent; |
| 1969 | 2062 | ||
| 1970 | /* | 2063 | /* |
| 1971 | * The cgroup's css_sets list is in use if there are tasks | ||
| 1972 | * in the cpuset; the list is empty if there are none; | ||
| 1973 | * the cs->css.refcnt seems always 0. | ||
| 1974 | */ | ||
| 1975 | if (list_empty(&cs->css.cgroup->css_sets)) | ||
| 1976 | return; | ||
| 1977 | |||
| 1978 | /* | ||
| 1979 | * Find its next-highest non-empty parent, (top cpuset | 2064 | * Find its next-highest non-empty parent, (top cpuset |
| 1980 | * has online cpus, so can't be empty). | 2065 | * has online cpus, so can't be empty). |
| 1981 | */ | 2066 | */ |
| 1982 | parent = cs->parent; | 2067 | parent = parent_cs(cs); |
| 1983 | while (cpumask_empty(parent->cpus_allowed) || | 2068 | while (cpumask_empty(parent->cpus_allowed) || |
| 1984 | nodes_empty(parent->mems_allowed)) | 2069 | nodes_empty(parent->mems_allowed)) |
| 1985 | parent = parent->parent; | 2070 | parent = parent_cs(parent); |
| 1986 | 2071 | ||
| 1987 | move_member_tasks_to_cpuset(cs, parent); | 2072 | move_member_tasks_to_cpuset(cs, parent); |
| 1988 | } | 2073 | } |
| 1989 | 2074 | ||
| 1990 | /* | 2075 | /** |
| 1991 | * Helper function to traverse cpusets. | 2076 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset |
| 1992 | * It can be used to walk the cpuset tree from top to bottom, completing | 2077 | * @cs: cpuset in interest |
| 1993 | * one layer before dropping down to the next (thus always processing a | 2078 | * |
| 1994 | * node before any of its children). | 2079 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
| 2080 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | ||
| 2081 | * all its tasks are moved to the nearest ancestor with both resources. | ||
| 1995 | */ | 2082 | */ |
| 1996 | static struct cpuset *cpuset_next(struct list_head *queue) | 2083 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work) |
| 1997 | { | 2084 | { |
| 1998 | struct cpuset *cp; | 2085 | static cpumask_t off_cpus; |
| 1999 | struct cpuset *child; /* scans child cpusets of cp */ | 2086 | static nodemask_t off_mems, tmp_mems; |
| 2000 | struct cgroup *cont; | 2087 | struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); |
| 2088 | bool is_empty; | ||
| 2001 | 2089 | ||
| 2002 | if (list_empty(queue)) | 2090 | mutex_lock(&cpuset_mutex); |
| 2003 | return NULL; | 2091 | |
| 2092 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | ||
| 2093 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | ||
| 2004 | 2094 | ||
| 2005 | cp = list_first_entry(queue, struct cpuset, stack_list); | 2095 | /* remove offline cpus from @cs */ |
| 2006 | list_del(queue->next); | 2096 | if (!cpumask_empty(&off_cpus)) { |
| 2007 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 2097 | mutex_lock(&callback_mutex); |
| 2008 | child = cgroup_cs(cont); | 2098 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
| 2009 | list_add_tail(&child->stack_list, queue); | 2099 | mutex_unlock(&callback_mutex); |
| 2100 | update_tasks_cpumask(cs, NULL); | ||
| 2101 | } | ||
| 2102 | |||
| 2103 | /* remove offline mems from @cs */ | ||
| 2104 | if (!nodes_empty(off_mems)) { | ||
| 2105 | tmp_mems = cs->mems_allowed; | ||
| 2106 | mutex_lock(&callback_mutex); | ||
| 2107 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | ||
| 2108 | mutex_unlock(&callback_mutex); | ||
| 2109 | update_tasks_nodemask(cs, &tmp_mems, NULL); | ||
| 2010 | } | 2110 | } |
| 2011 | 2111 | ||
| 2012 | return cp; | 2112 | is_empty = cpumask_empty(cs->cpus_allowed) || |
| 2113 | nodes_empty(cs->mems_allowed); | ||
| 2114 | |||
| 2115 | mutex_unlock(&cpuset_mutex); | ||
| 2116 | |||
| 2117 | /* | ||
| 2118 | * If @cs became empty, move tasks to the nearest ancestor with | ||
| 2119 | * execution resources. This is full cgroup operation which will | ||
| 2120 | * also call back into cpuset. Should be done outside any lock. | ||
| 2121 | */ | ||
| 2122 | if (is_empty) | ||
| 2123 | remove_tasks_in_empty_cpuset(cs); | ||
| 2124 | |||
| 2125 | /* the following may free @cs, should be the last operation */ | ||
| 2126 | css_put(&cs->css); | ||
| 2013 | } | 2127 | } |
| 2014 | 2128 | ||
| 2129 | /** | ||
| 2130 | * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset | ||
| 2131 | * @cs: cpuset of interest | ||
| 2132 | * | ||
| 2133 | * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and | ||
| 2134 | * memory masks according to top_cpuset. | ||
| 2135 | */ | ||
| 2136 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | ||
| 2137 | { | ||
| 2138 | /* | ||
| 2139 | * Pin @cs. The refcnt will be released when the work item | ||
| 2140 | * finishes executing. | ||
| 2141 | */ | ||
| 2142 | if (!css_tryget(&cs->css)) | ||
| 2143 | return; | ||
| 2015 | 2144 | ||
| 2016 | /* | 2145 | /* |
| 2017 | * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory | 2146 | * Queue @cs->hotplug_work. If already pending, lose the css ref. |
| 2018 | * online/offline) and update the cpusets accordingly. | 2147 | * cpuset_propagate_hotplug_wq is ordered and propagation will |
| 2019 | * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such | 2148 | * happen in the order this function is called. |
| 2020 | * cpuset must be moved to a parent cpuset. | 2149 | */ |
| 2150 | if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) | ||
| 2151 | css_put(&cs->css); | ||
| 2152 | } | ||
| 2153 | |||
| 2154 | /** | ||
| 2155 | * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset | ||
| 2021 | * | 2156 | * |
| 2022 | * Called with cgroup_mutex held. We take callback_mutex to modify | 2157 | * This function is called after either CPU or memory configuration has |
| 2023 | * cpus_allowed and mems_allowed. | 2158 | * changed and updates cpuset accordingly. The top_cpuset is always |
| 2159 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in | ||
| 2160 | * order to make cpusets transparent (of no effect) on systems that are | ||
| 2161 | * actively using CPU hotplug but making no active use of cpusets. | ||
| 2024 | * | 2162 | * |
| 2025 | * This walk processes the tree from top to bottom, completing one layer | 2163 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
| 2026 | * before dropping down to the next. It always processes a node before | 2164 | * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all |
| 2027 | * any of its children. | 2165 | * descendants. |
| 2028 | * | 2166 | * |
| 2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY | 2167 | * Note that CPU offlining during suspend is ignored. We don't modify |
| 2030 | * if all present pages from a node are offlined. | 2168 | * cpusets across suspend/resume cycles at all. |
| 2031 | */ | 2169 | */ |
| 2032 | static void | 2170 | static void cpuset_hotplug_workfn(struct work_struct *work) |
| 2033 | scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | ||
| 2034 | { | 2171 | { |
| 2035 | LIST_HEAD(queue); | 2172 | static cpumask_t new_cpus, tmp_cpus; |
| 2036 | struct cpuset *cp; /* scans cpusets being updated */ | 2173 | static nodemask_t new_mems, tmp_mems; |
| 2037 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2174 | bool cpus_updated, mems_updated; |
| 2175 | bool cpus_offlined, mems_offlined; | ||
| 2038 | 2176 | ||
| 2039 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2177 | mutex_lock(&cpuset_mutex); |
| 2040 | 2178 | ||
| 2041 | switch (event) { | 2179 | /* fetch the available cpus/mems and find out which changed how */ |
| 2042 | case CPUSET_CPU_OFFLINE: | 2180 | cpumask_copy(&new_cpus, cpu_active_mask); |
| 2043 | while ((cp = cpuset_next(&queue)) != NULL) { | 2181 | new_mems = node_states[N_MEMORY]; |
| 2044 | 2182 | ||
| 2045 | /* Continue past cpusets with all cpus online */ | 2183 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
| 2046 | if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) | 2184 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, |
| 2047 | continue; | 2185 | &new_cpus); |
| 2048 | 2186 | ||
| 2049 | /* Remove offline cpus from this cpuset. */ | 2187 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
| 2050 | mutex_lock(&callback_mutex); | 2188 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); |
| 2051 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, | 2189 | mems_offlined = !nodes_empty(tmp_mems); |
| 2052 | cpu_active_mask); | ||
| 2053 | mutex_unlock(&callback_mutex); | ||
| 2054 | 2190 | ||
| 2055 | /* Move tasks from the empty cpuset to a parent */ | 2191 | /* synchronize cpus_allowed to cpu_active_mask */ |
| 2056 | if (cpumask_empty(cp->cpus_allowed)) | 2192 | if (cpus_updated) { |
| 2057 | remove_tasks_in_empty_cpuset(cp); | 2193 | mutex_lock(&callback_mutex); |
| 2058 | else | 2194 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
| 2059 | update_tasks_cpumask(cp, NULL); | 2195 | mutex_unlock(&callback_mutex); |
| 2060 | } | 2196 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
| 2061 | break; | 2197 | } |
| 2062 | 2198 | ||
| 2063 | case CPUSET_MEM_OFFLINE: | 2199 | /* synchronize mems_allowed to N_MEMORY */ |
| 2064 | while ((cp = cpuset_next(&queue)) != NULL) { | 2200 | if (mems_updated) { |
| 2201 | tmp_mems = top_cpuset.mems_allowed; | ||
| 2202 | mutex_lock(&callback_mutex); | ||
| 2203 | top_cpuset.mems_allowed = new_mems; | ||
| 2204 | mutex_unlock(&callback_mutex); | ||
| 2205 | update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); | ||
| 2206 | } | ||
| 2065 | 2207 | ||
| 2066 | /* Continue past cpusets with all mems online */ | 2208 | /* if cpus or mems went down, we need to propagate to descendants */ |
| 2067 | if (nodes_subset(cp->mems_allowed, | 2209 | if (cpus_offlined || mems_offlined) { |
| 2068 | node_states[N_MEMORY])) | 2210 | struct cpuset *cs; |
| 2069 | continue; | 2211 | struct cgroup *pos_cgrp; |
| 2070 | 2212 | ||
| 2071 | oldmems = cp->mems_allowed; | 2213 | rcu_read_lock(); |
| 2214 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) | ||
| 2215 | schedule_cpuset_propagate_hotplug(cs); | ||
| 2216 | rcu_read_unlock(); | ||
| 2217 | } | ||
| 2072 | 2218 | ||
| 2073 | /* Remove offline mems from this cpuset. */ | 2219 | mutex_unlock(&cpuset_mutex); |
| 2074 | mutex_lock(&callback_mutex); | ||
| 2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, | ||
| 2076 | node_states[N_MEMORY]); | ||
| 2077 | mutex_unlock(&callback_mutex); | ||
| 2078 | 2220 | ||
| 2079 | /* Move tasks from the empty cpuset to a parent */ | 2221 | /* wait for propagations to finish */ |
| 2080 | if (nodes_empty(cp->mems_allowed)) | 2222 | flush_workqueue(cpuset_propagate_hotplug_wq); |
| 2081 | remove_tasks_in_empty_cpuset(cp); | 2223 | |
| 2082 | else | 2224 | /* rebuild sched domains if cpus_allowed has changed */ |
| 2083 | update_tasks_nodemask(cp, &oldmems, NULL); | 2225 | if (cpus_updated) { |
| 2084 | } | 2226 | struct sched_domain_attr *attr; |
| 2227 | cpumask_var_t *doms; | ||
| 2228 | int ndoms; | ||
| 2229 | |||
| 2230 | mutex_lock(&cpuset_mutex); | ||
| 2231 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 2232 | mutex_unlock(&cpuset_mutex); | ||
| 2233 | |||
| 2234 | partition_sched_domains(ndoms, doms, attr); | ||
| 2085 | } | 2235 | } |
| 2086 | } | 2236 | } |
| 2087 | 2237 | ||
| 2088 | /* | ||
| 2089 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | ||
| 2090 | * period. This is necessary in order to make cpusets transparent | ||
| 2091 | * (of no affect) on systems that are actively using CPU hotplug | ||
| 2092 | * but making no active use of cpusets. | ||
| 2093 | * | ||
| 2094 | * The only exception to this is suspend/resume, where we don't | ||
| 2095 | * modify cpusets at all. | ||
| 2096 | * | ||
| 2097 | * This routine ensures that top_cpuset.cpus_allowed tracks | ||
| 2098 | * cpu_active_mask on each CPU hotplug (cpuhp) event. | ||
| 2099 | * | ||
| 2100 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
| 2101 | * before calling generate_sched_domains(). | ||
| 2102 | * | ||
| 2103 | * @cpu_online: Indicates whether this is a CPU online event (true) or | ||
| 2104 | * a CPU offline event (false). | ||
| 2105 | */ | ||
| 2106 | void cpuset_update_active_cpus(bool cpu_online) | 2238 | void cpuset_update_active_cpus(bool cpu_online) |
| 2107 | { | 2239 | { |
| 2108 | struct sched_domain_attr *attr; | 2240 | /* |
| 2109 | cpumask_var_t *doms; | 2241 | * We're inside cpu hotplug critical region which usually nests |
| 2110 | int ndoms; | 2242 | * inside cgroup synchronization. Bounce actual hotplug processing |
| 2111 | 2243 | * to a work item to avoid reverse locking order. | |
| 2112 | cgroup_lock(); | 2244 | * |
| 2113 | mutex_lock(&callback_mutex); | 2245 | * We still need to do partition_sched_domains() synchronously; |
| 2114 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2246 | * otherwise, the scheduler will get confused and put tasks to the |
| 2115 | mutex_unlock(&callback_mutex); | 2247 | * dead CPU. Fall back to the default single domain. |
| 2116 | 2248 | * cpuset_hotplug_workfn() will rebuild it as necessary. | |
| 2117 | if (!cpu_online) | 2249 | */ |
| 2118 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); | 2250 | partition_sched_domains(1, NULL, NULL); |
| 2119 | 2251 | schedule_work(&cpuset_hotplug_work); | |
| 2120 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 2121 | cgroup_unlock(); | ||
| 2122 | |||
| 2123 | /* Have scheduler rebuild the domains */ | ||
| 2124 | partition_sched_domains(ndoms, doms, attr); | ||
| 2125 | } | 2252 | } |
| 2126 | 2253 | ||
| 2127 | #ifdef CONFIG_MEMORY_HOTPLUG | 2254 | #ifdef CONFIG_MEMORY_HOTPLUG |
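Two patterns in the hunk above are worth calling out: schedule_cpuset_propagate_hotplug() pins the cpuset with css_tryget() before queueing its work item, immediately drops that reference if queue_work() reports the item was already pending, and the work function drops it as its very last action; and cpuset_update_active_cpus() pushes all heavy processing to a work item so that cpuset_mutex is never taken inside the CPU-hotplug critical section, while still synchronously falling back to a single sched domain. The userspace fragment below imitates only the pin-then-queue refcounting idea, with a plain atomic counter and a boolean standing in for the "already pending" return of queue_work(); it is an analogue, not the css refcount API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct object {
    atomic_int refcnt;
    atomic_bool work_pending;
};

static void object_put(struct object *obj)
{
    if (atomic_fetch_sub(&obj->refcnt, 1) == 1)
        free(obj);                      /* last reference gone */
}

static bool object_get(struct object *obj)
{
    /* the kernel uses css_tryget(), which can fail; here the object is
     * known to be live, so the get always succeeds */
    atomic_fetch_add(&obj->refcnt, 1);
    return true;
}

void schedule_propagation(struct object *obj)
{
    if (!object_get(obj))
        return;

    /* queue_work() returns false when the item is already pending */
    if (atomic_exchange(&obj->work_pending, true))
        object_put(obj);                /* already queued: drop our pin */
}

void propagation_workfn(struct object *obj)
{
    atomic_store(&obj->work_pending, false);
    /* ... propagate the hotplug event to obj ... */
    object_put(obj);                    /* may free obj: keep it last */
}

int main(void)
{
    struct object *obj = calloc(1, sizeof(*obj));

    if (!obj)
        return 1;
    atomic_init(&obj->refcnt, 1);       /* the caller's reference */
    atomic_init(&obj->work_pending, false);

    schedule_propagation(obj);
    propagation_workfn(obj);            /* normally run by a worker thread */
    object_put(obj);                    /* drop the caller's reference */
    return 0;
}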
| @@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
| 2133 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2260 | static int cpuset_track_online_nodes(struct notifier_block *self, |
| 2134 | unsigned long action, void *arg) | 2261 | unsigned long action, void *arg) |
| 2135 | { | 2262 | { |
| 2136 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2263 | schedule_work(&cpuset_hotplug_work); |
| 2137 | |||
| 2138 | cgroup_lock(); | ||
| 2139 | switch (action) { | ||
| 2140 | case MEM_ONLINE: | ||
| 2141 | oldmems = top_cpuset.mems_allowed; | ||
| 2142 | mutex_lock(&callback_mutex); | ||
| 2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | ||
| 2144 | mutex_unlock(&callback_mutex); | ||
| 2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | ||
| 2146 | break; | ||
| 2147 | case MEM_OFFLINE: | ||
| 2148 | /* | ||
| 2149 | * needn't update top_cpuset.mems_allowed explicitly because | ||
| 2150 | * scan_cpusets_upon_hotplug() will update it. | ||
| 2151 | */ | ||
| 2152 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); | ||
| 2153 | break; | ||
| 2154 | default: | ||
| 2155 | break; | ||
| 2156 | } | ||
| 2157 | cgroup_unlock(); | ||
| 2158 | |||
| 2159 | return NOTIFY_OK; | 2264 | return NOTIFY_OK; |
| 2160 | } | 2265 | } |
| 2161 | #endif | 2266 | #endif |
| @@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void) | |||
| 2173 | 2278 | ||
| 2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2279 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
| 2175 | 2280 | ||
| 2176 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2281 | cpuset_propagate_hotplug_wq = |
| 2177 | BUG_ON(!cpuset_wq); | 2282 | alloc_ordered_workqueue("cpuset_hotplug", 0); |
| 2283 | BUG_ON(!cpuset_propagate_hotplug_wq); | ||
| 2178 | } | 2284 | } |
| 2179 | 2285 | ||
| 2180 | /** | 2286 | /** |
| @@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
| 2273 | */ | 2379 | */ |
| 2274 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2380 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) |
| 2275 | { | 2381 | { |
| 2276 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) | 2382 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
| 2277 | cs = cs->parent; | 2383 | cs = parent_cs(cs); |
| 2278 | return cs; | 2384 | return cs; |
| 2279 | } | 2385 | } |
| 2280 | 2386 | ||
| @@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | |||
| 2412 | } | 2518 | } |
| 2413 | 2519 | ||
| 2414 | /** | 2520 | /** |
| 2415 | * cpuset_unlock - release lock on cpuset changes | ||
| 2416 | * | ||
| 2417 | * Undo the lock taken in a previous cpuset_lock() call. | ||
| 2418 | */ | ||
| 2419 | |||
| 2420 | void cpuset_unlock(void) | ||
| 2421 | { | ||
| 2422 | mutex_unlock(&callback_mutex); | ||
| 2423 | } | ||
| 2424 | |||
| 2425 | /** | ||
| 2426 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 2521 | * cpuset_mem_spread_node() - On which node to begin search for a file page |
| 2427 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | 2522 | * cpuset_slab_spread_node() - On which node to begin search for a slab page |
| 2428 | * | 2523 | * |
| @@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) | |||
| 2511 | 2606 | ||
| 2512 | dentry = task_cs(tsk)->css.cgroup->dentry; | 2607 | dentry = task_cs(tsk)->css.cgroup->dentry; |
| 2513 | spin_lock(&cpuset_buffer_lock); | 2608 | spin_lock(&cpuset_buffer_lock); |
| 2514 | snprintf(cpuset_name, CPUSET_NAME_LEN, | 2609 | |
| 2515 | dentry ? (const char *)dentry->d_name.name : "/"); | 2610 | if (!dentry) { |
| 2611 | strcpy(cpuset_name, "/"); | ||
| 2612 | } else { | ||
| 2613 | spin_lock(&dentry->d_lock); | ||
| 2614 | strlcpy(cpuset_name, (const char *)dentry->d_name.name, | ||
| 2615 | CPUSET_NAME_LEN); | ||
| 2616 | spin_unlock(&dentry->d_lock); | ||
| 2617 | } | ||
| 2618 | |||
| 2516 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | 2619 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, |
| 2517 | tsk->mems_allowed); | 2620 | tsk->mems_allowed); |
| 2518 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", | 2621 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", |
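The old code handed dentry->d_name.name to snprintf() as the format string and read the name with no protection against a concurrent rename; the replacement takes d_lock and does a bounded strlcpy() into the static buffer. As a userspace illustration of the same two points (never use externally supplied text as a format string, and bound the copy), using snprintf() with an explicit "%s" since strlcpy() is not part of standard C:

#include <stdio.h>
#include <string.h>

#define NAME_LEN 64

static void copy_name(char *dst, const char *src)
{
    /* BAD:  snprintf(dst, NAME_LEN, src);  - src may contain conversions */
    /* GOOD: bounded copy, with the name treated purely as data */
    snprintf(dst, NAME_LEN, "%s", src);
}

int main(void)
{
    char buf[NAME_LEN];

    copy_name(buf, "cpuset-with-a-%s-in-its-name");
    printf("%s\n", buf);
    return 0;
}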
| @@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2560 | * - Used for /proc/<pid>/cpuset. | 2663 | * - Used for /proc/<pid>/cpuset. |
| 2561 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2664 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
| 2562 | * doesn't really matter if tsk->cpuset changes after we read it, | 2665 | * doesn't really matter if tsk->cpuset changes after we read it, |
| 2563 | * and we take cgroup_mutex, keeping cpuset_attach() from changing it | 2666 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
| 2564 | * anyway. | 2667 | * anyway. |
| 2565 | */ | 2668 | */ |
| 2566 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2669 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
| @@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
| 2582 | if (!tsk) | 2685 | if (!tsk) |
| 2583 | goto out_free; | 2686 | goto out_free; |
| 2584 | 2687 | ||
| 2585 | retval = -EINVAL; | 2688 | rcu_read_lock(); |
| 2586 | cgroup_lock(); | ||
| 2587 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2689 | css = task_subsys_state(tsk, cpuset_subsys_id); |
| 2588 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2690 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
| 2691 | rcu_read_unlock(); | ||
| 2589 | if (retval < 0) | 2692 | if (retval < 0) |
| 2590 | goto out_unlock; | 2693 | goto out_put_task; |
| 2591 | seq_puts(m, buf); | 2694 | seq_puts(m, buf); |
| 2592 | seq_putc(m, '\n'); | 2695 | seq_putc(m, '\n'); |
| 2593 | out_unlock: | 2696 | out_put_task: |
| 2594 | cgroup_unlock(); | ||
| 2595 | put_task_struct(tsk); | 2697 | put_task_struct(tsk); |
| 2596 | out_free: | 2698 | out_free: |
| 2597 | kfree(buf); | 2699 | kfree(buf); |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..c26278fd4851 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | */ | 29 | */ |
| 30 | #include <linux/pid_namespace.h> | 30 | #include <linux/pid_namespace.h> |
| 31 | #include <linux/clocksource.h> | 31 | #include <linux/clocksource.h> |
| 32 | #include <linux/serial_core.h> | ||
| 32 | #include <linux/interrupt.h> | 33 | #include <linux/interrupt.h> |
| 33 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
| 34 | #include <linux/console.h> | 35 | #include <linux/console.h> |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..38573f35a5ad 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/kernel.h> | 31 | #include <linux/kernel.h> |
| 32 | #include <linux/kgdb.h> | 32 | #include <linux/kgdb.h> |
| 33 | #include <linux/kdb.h> | 33 | #include <linux/kdb.h> |
| 34 | #include <linux/serial_core.h> | ||
| 34 | #include <linux/reboot.h> | 35 | #include <linux/reboot.h> |
| 35 | #include <linux/uaccess.h> | 36 | #include <linux/uaccess.h> |
| 36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f3..8875254120b6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv) | |||
| 1970 | 1970 | ||
| 1971 | kdb_printf("Module Size modstruct Used by\n"); | 1971 | kdb_printf("Module Size modstruct Used by\n"); |
| 1972 | list_for_each_entry(mod, kdb_modules, list) { | 1972 | list_for_each_entry(mod, kdb_modules, list) { |
| 1973 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 1974 | continue; | ||
| 1973 | 1975 | ||
| 1974 | kdb_printf("%-20s%8u 0x%p ", mod->name, | 1976 | kdb_printf("%-20s%8u 0x%p ", mod->name, |
| 1975 | mod->core_size, (void *)mod); | 1977 | mod->core_size, (void *)mod); |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053aa..d473988c1d0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
| @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
| 106 | unsigned long long t2, t3; | 106 | unsigned long long t2, t3; |
| 107 | unsigned long flags; | 107 | unsigned long flags; |
| 108 | struct timespec ts; | 108 | struct timespec ts; |
| 109 | cputime_t utime, stime, stimescaled, utimescaled; | ||
| 109 | 110 | ||
| 110 | /* Though tsk->delays accessed later, early exit avoids | 111 | /* Though tsk->delays accessed later, early exit avoids |
| 111 | * unnecessary returning of other data | 112 | * unnecessary returning of other data |
| @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
| 114 | goto done; | 115 | goto done; |
| 115 | 116 | ||
| 116 | tmp = (s64)d->cpu_run_real_total; | 117 | tmp = (s64)d->cpu_run_real_total; |
| 117 | cputime_to_timespec(tsk->utime + tsk->stime, &ts); | 118 | task_cputime(tsk, &utime, &stime); |
| 119 | cputime_to_timespec(utime + stime, &ts); | ||
| 118 | tmp += timespec_to_ns(&ts); | 120 | tmp += timespec_to_ns(&ts); |
| 119 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; | 121 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; |
| 120 | 122 | ||
| 121 | tmp = (s64)d->cpu_scaled_run_real_total; | 123 | tmp = (s64)d->cpu_scaled_run_real_total; |
| 122 | cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); | 124 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); |
| 125 | cputime_to_timespec(utimescaled + stimescaled, &ts); | ||
| 123 | tmp += timespec_to_ns(&ts); | 126 | tmp += timespec_to_ns(&ts); |
| 124 | d->cpu_scaled_run_real_total = | 127 | d->cpu_scaled_run_real_total = |
| 125 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; | 128 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; |
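
The delayacct hunk stops reading tsk->utime/stime (and the scaled variants) directly and goes through the task_cputime()/task_cputime_scaled() accessors instead, which is what lets the vtime/full-dynticks accounting hook into cputime reads. A minimal sketch of the accessor-based conversion to nanoseconds, reusing the same helpers as the hunk:

    #include <linux/sched.h>
    #include <linux/time.h>

    /* Sketch: a task's total (utime + stime) in nanoseconds via the accessor. */
    static s64 example_task_cpu_ns(struct task_struct *tsk)
    {
        cputime_t utime, stime;
        struct timespec ts;

        task_cputime(tsk, &utime, &stime);
        cputime_to_timespec(utime + stime, &ts);
        return timespec_to_ns(&ts);
    }
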
diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d06f24..b0cd86501c30 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 908 | } | 908 | } |
| 909 | 909 | ||
| 910 | /* | 910 | /* |
| 911 | * Initialize event state based on the perf_event_attr::disabled. | ||
| 912 | */ | ||
| 913 | static inline void perf_event__state_init(struct perf_event *event) | ||
| 914 | { | ||
| 915 | event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : | ||
| 916 | PERF_EVENT_STATE_INACTIVE; | ||
| 917 | } | ||
| 918 | |||
| 919 | /* | ||
| 911 | * Called at perf_event creation and when events are attached/detached from a | 920 | * Called at perf_event creation and when events are attached/detached from a |
| 912 | * group. | 921 | * group. |
| 913 | */ | 922 | */ |
| @@ -3682,7 +3691,7 @@ unlock: | |||
| 3682 | 3691 | ||
| 3683 | static int perf_fasync(int fd, struct file *filp, int on) | 3692 | static int perf_fasync(int fd, struct file *filp, int on) |
| 3684 | { | 3693 | { |
| 3685 | struct inode *inode = filp->f_path.dentry->d_inode; | 3694 | struct inode *inode = file_inode(filp); |
| 3686 | struct perf_event *event = filp->private_data; | 3695 | struct perf_event *event = filp->private_data; |
| 3687 | int retval; | 3696 | int retval; |
| 3688 | 3697 | ||
| @@ -5117,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 5117 | { | 5126 | { |
| 5118 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5127 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
| 5119 | struct perf_event *event; | 5128 | struct perf_event *event; |
| 5120 | struct hlist_node *node; | ||
| 5121 | struct hlist_head *head; | 5129 | struct hlist_head *head; |
| 5122 | 5130 | ||
| 5123 | rcu_read_lock(); | 5131 | rcu_read_lock(); |
| @@ -5125,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 5125 | if (!head) | 5133 | if (!head) |
| 5126 | goto end; | 5134 | goto end; |
| 5127 | 5135 | ||
| 5128 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5136 | hlist_for_each_entry_rcu(event, head, hlist_entry) { |
| 5129 | if (perf_swevent_match(event, type, event_id, data, regs)) | 5137 | if (perf_swevent_match(event, type, event_id, data, regs)) |
| 5130 | perf_swevent_event(event, nr, data, regs); | 5138 | perf_swevent_event(event, nr, data, regs); |
| 5131 | } | 5139 | } |
| @@ -5410,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
| 5410 | { | 5418 | { |
| 5411 | struct perf_sample_data data; | 5419 | struct perf_sample_data data; |
| 5412 | struct perf_event *event; | 5420 | struct perf_event *event; |
| 5413 | struct hlist_node *node; | ||
| 5414 | 5421 | ||
| 5415 | struct perf_raw_record raw = { | 5422 | struct perf_raw_record raw = { |
| 5416 | .size = entry_size, | 5423 | .size = entry_size, |
| @@ -5420,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
| 5420 | perf_sample_data_init(&data, addr, 0); | 5427 | perf_sample_data_init(&data, addr, 0); |
| 5421 | data.raw = &raw; | 5428 | data.raw = &raw; |
| 5422 | 5429 | ||
| 5423 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5430 | hlist_for_each_entry_rcu(event, head, hlist_entry) { |
| 5424 | if (perf_tp_event_match(event, &data, regs)) | 5431 | if (perf_tp_event_match(event, &data, regs)) |
| 5425 | perf_swevent_event(event, count, &data, regs); | 5432 | perf_swevent_event(event, count, &data, regs); |
| 5426 | } | 5433 | } |
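
The two hunks above follow the tree-wide hlist API change: hlist_for_each_entry_rcu() no longer takes a separate struct hlist_node * cursor and iterates with just the entry pointer. A sketch of the new-style walk over a hypothetical RCU-protected bucket (caller holds rcu_read_lock()):

    #include <linux/rculist.h>

    struct example_ev {
        struct hlist_node node;
        u32 id;
    };

    /* Sketch: look up an id in an RCU hlist bucket, no node cursor needed. */
    static struct example_ev *example_find(struct hlist_head *head, u32 id)
    {
        struct example_ev *ev;

        hlist_for_each_entry_rcu(ev, head, node) {
            if (ev->id == id)
                return ev;
        }
        return NULL;
    }
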
| @@ -5956,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) | |||
| 5956 | pmu->name = name; | 5963 | pmu->name = name; |
| 5957 | 5964 | ||
| 5958 | if (type < 0) { | 5965 | if (type < 0) { |
| 5959 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | 5966 | type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); |
| 5960 | if (!err) | 5967 | if (type < 0) { |
| 5961 | goto free_pdc; | 5968 | ret = type; |
| 5962 | |||
| 5963 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
| 5964 | if (err) { | ||
| 5965 | ret = err; | ||
| 5966 | goto free_pdc; | 5969 | goto free_pdc; |
| 5967 | } | 5970 | } |
| 5968 | } | 5971 | } |
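
The pmu registration hunk converts the old two-step idr_pre_get() + idr_get_new_above() protocol into a single idr_alloc() call. A hedged sketch of the new-style allocation, with an illustrative idr and starting id:

    #include <linux/idr.h>
    #include <linux/gfp.h>

    static DEFINE_IDR(example_idr);

    /*
     * Sketch: allocate an id >= 16 for @ptr in one call. idr_alloc() returns
     * the new id or a negative errno; an end of 0 means "no upper bound".
     */
    static int example_get_id(void *ptr)
    {
        return idr_alloc(&example_idr, ptr, 16, 0, GFP_KERNEL);
    }
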
| @@ -6162,11 +6165,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6162 | 6165 | ||
| 6163 | if (task) { | 6166 | if (task) { |
| 6164 | event->attach_state = PERF_ATTACH_TASK; | 6167 | event->attach_state = PERF_ATTACH_TASK; |
| 6168 | |||
| 6169 | if (attr->type == PERF_TYPE_TRACEPOINT) | ||
| 6170 | event->hw.tp_target = task; | ||
| 6165 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 6171 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
| 6166 | /* | 6172 | /* |
| 6167 | * hw_breakpoint is a bit difficult here.. | 6173 | * hw_breakpoint is a bit difficult here.. |
| 6168 | */ | 6174 | */ |
| 6169 | if (attr->type == PERF_TYPE_BREAKPOINT) | 6175 | else if (attr->type == PERF_TYPE_BREAKPOINT) |
| 6170 | event->hw.bp_target = task; | 6176 | event->hw.bp_target = task; |
| 6171 | #endif | 6177 | #endif |
| 6172 | } | 6178 | } |
| @@ -6179,8 +6185,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6179 | event->overflow_handler = overflow_handler; | 6185 | event->overflow_handler = overflow_handler; |
| 6180 | event->overflow_handler_context = context; | 6186 | event->overflow_handler_context = context; |
| 6181 | 6187 | ||
| 6182 | if (attr->disabled) | 6188 | perf_event__state_init(event); |
| 6183 | event->state = PERF_EVENT_STATE_OFF; | ||
| 6184 | 6189 | ||
| 6185 | pmu = NULL; | 6190 | pmu = NULL; |
| 6186 | 6191 | ||
| @@ -6609,9 +6614,17 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6609 | 6614 | ||
| 6610 | mutex_lock(&gctx->mutex); | 6615 | mutex_lock(&gctx->mutex); |
| 6611 | perf_remove_from_context(group_leader); | 6616 | perf_remove_from_context(group_leader); |
| 6617 | |||
| 6618 | /* | ||
| 6619 | * Removing from the context ends up with a disabled | ||
| 6620 | * event. What we want here is an event in the initial | ||
| 6621 | * startup state, ready to be added into a new context. | ||
| 6622 | */ | ||
| 6623 | perf_event__state_init(group_leader); | ||
| 6612 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6624 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 6613 | group_entry) { | 6625 | group_entry) { |
| 6614 | perf_remove_from_context(sibling); | 6626 | perf_remove_from_context(sibling); |
| 6627 | perf_event__state_init(sibling); | ||
| 6615 | put_ctx(gctx); | 6628 | put_ctx(gctx); |
| 6616 | } | 6629 | } |
| 6617 | mutex_unlock(&gctx->mutex); | 6630 | mutex_unlock(&gctx->mutex); |
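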
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507ed..a64f8aeb5c1f 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void) | |||
| 676 | err_alloc: | 676 | err_alloc: |
| 677 | for_each_possible_cpu(err_cpu) { | 677 | for_each_possible_cpu(err_cpu) { |
| 678 | for (i = 0; i < TYPE_MAX; i++) | 678 | for (i = 0; i < TYPE_MAX; i++) |
| 679 | kfree(per_cpu(nr_task_bp_pinned[i], cpu)); | 679 | kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); |
| 680 | if (err_cpu == cpu) | 680 | if (err_cpu == cpu) |
| 681 | break; | 681 | break; |
| 682 | } | 682 | } |
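
The hw_breakpoint change is a plain error-unwind fix: the cleanup loop iterates with err_cpu but freed per_cpu(..., cpu), so everything except the failing CPU's buffers leaked. A generic sketch of the corrected pattern, with invented names and sizes:

    #include <linux/cpumask.h>
    #include <linux/percpu.h>
    #include <linux/slab.h>

    static DEFINE_PER_CPU(void *, example_buf);

    /*
     * Sketch: allocate a buffer per possible CPU; on failure free exactly the
     * ones already allocated, indexing the free with the unwind iterator.
     */
    static int example_alloc_all(void)
    {
        int cpu, err_cpu;

        for_each_possible_cpu(cpu) {
            per_cpu(example_buf, cpu) = kzalloc(64, GFP_KERNEL);
            if (!per_cpu(example_buf, cpu))
                goto err_alloc;
        }
        return 0;

    err_alloc:
        for_each_possible_cpu(err_cpu) {
            kfree(per_cpu(example_buf, err_cpu));    /* err_cpu, not cpu */
            if (err_cpu == cpu)
                break;
        }
        return -ENOMEM;
    }
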
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb071..a567c8c7ef31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/pagemap.h> /* read_mapping_page */ | 27 | #include <linux/pagemap.h> /* read_mapping_page */ |
| 28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
| 29 | #include <linux/sched.h> | 29 | #include <linux/sched.h> |
| 30 | #include <linux/export.h> | ||
| 30 | #include <linux/rmap.h> /* anon_vma_prepare */ | 31 | #include <linux/rmap.h> /* anon_vma_prepare */ |
| 31 | #include <linux/mmu_notifier.h> /* set_pte_at_notify */ | 32 | #include <linux/mmu_notifier.h> /* set_pte_at_notify */ |
| 32 | #include <linux/swap.h> /* try_to_free_swap */ | 33 | #include <linux/swap.h> /* try_to_free_swap */ |
| @@ -41,58 +42,31 @@ | |||
| 41 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | 42 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE |
| 42 | 43 | ||
| 43 | static struct rb_root uprobes_tree = RB_ROOT; | 44 | static struct rb_root uprobes_tree = RB_ROOT; |
| 44 | |||
| 45 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | ||
| 46 | |||
| 47 | #define UPROBES_HASH_SZ 13 | ||
| 48 | |||
| 49 | /* | 45 | /* |
| 50 | * We need separate register/unregister and mmap/munmap lock hashes because | 46 | * allows us to skip the uprobe_mmap if there are no uprobe events active |
| 51 | * of mmap_sem nesting. | 47 | * at this time. Probably a fine grained per inode count is better? |
| 52 | * | ||
| 53 | * uprobe_register() needs to install probes on (potentially) all processes | ||
| 54 | * and thus needs to acquire multiple mmap_sems (consequtively, not | ||
| 55 | * concurrently), whereas uprobe_mmap() is called while holding mmap_sem | ||
| 56 | * for the particular process doing the mmap. | ||
| 57 | * | ||
| 58 | * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem | ||
| 59 | * because of lock order against i_mmap_mutex. This means there's a hole in | ||
| 60 | * the register vma iteration where a mmap() can happen. | ||
| 61 | * | ||
| 62 | * Thus uprobe_register() can race with uprobe_mmap() and we can try and | ||
| 63 | * install a probe where one is already installed. | ||
| 64 | */ | 48 | */ |
| 49 | #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) | ||
| 65 | 50 | ||
| 66 | /* serialize (un)register */ | 51 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ |
| 67 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | ||
| 68 | |||
| 69 | #define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
| 70 | 52 | ||
| 53 | #define UPROBES_HASH_SZ 13 | ||
| 71 | /* serialize uprobe->pending_list */ | 54 | /* serialize uprobe->pending_list */ |
| 72 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | 55 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; |
| 73 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | 56 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) |
| 74 | 57 | ||
| 75 | static struct percpu_rw_semaphore dup_mmap_sem; | 58 | static struct percpu_rw_semaphore dup_mmap_sem; |
| 76 | 59 | ||
| 77 | /* | ||
| 78 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | ||
| 79 | * events active at this time. Probably a fine grained per inode count is | ||
| 80 | * better? | ||
| 81 | */ | ||
| 82 | static atomic_t uprobe_events = ATOMIC_INIT(0); | ||
| 83 | |||
| 84 | /* Have a copy of original instruction */ | 60 | /* Have a copy of original instruction */ |
| 85 | #define UPROBE_COPY_INSN 0 | 61 | #define UPROBE_COPY_INSN 0 |
| 86 | /* Dont run handlers when first register/ last unregister in progress*/ | ||
| 87 | #define UPROBE_RUN_HANDLER 1 | ||
| 88 | /* Can skip singlestep */ | 62 | /* Can skip singlestep */ |
| 89 | #define UPROBE_SKIP_SSTEP 2 | 63 | #define UPROBE_SKIP_SSTEP 1 |
| 90 | 64 | ||
| 91 | struct uprobe { | 65 | struct uprobe { |
| 92 | struct rb_node rb_node; /* node in the rb tree */ | 66 | struct rb_node rb_node; /* node in the rb tree */ |
| 93 | atomic_t ref; | 67 | atomic_t ref; |
| 68 | struct rw_semaphore register_rwsem; | ||
| 94 | struct rw_semaphore consumer_rwsem; | 69 | struct rw_semaphore consumer_rwsem; |
| 95 | struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ | ||
| 96 | struct list_head pending_list; | 70 | struct list_head pending_list; |
| 97 | struct uprobe_consumer *consumers; | 71 | struct uprobe_consumer *consumers; |
| 98 | struct inode *inode; /* Also hold a ref to inode */ | 72 | struct inode *inode; /* Also hold a ref to inode */ |
| @@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) | |||
| 430 | u = __insert_uprobe(uprobe); | 404 | u = __insert_uprobe(uprobe); |
| 431 | spin_unlock(&uprobes_treelock); | 405 | spin_unlock(&uprobes_treelock); |
| 432 | 406 | ||
| 433 | /* For now assume that the instruction need not be single-stepped */ | ||
| 434 | __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
| 435 | |||
| 436 | return u; | 407 | return u; |
| 437 | } | 408 | } |
| 438 | 409 | ||
| @@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
| 452 | 423 | ||
| 453 | uprobe->inode = igrab(inode); | 424 | uprobe->inode = igrab(inode); |
| 454 | uprobe->offset = offset; | 425 | uprobe->offset = offset; |
| 426 | init_rwsem(&uprobe->register_rwsem); | ||
| 455 | init_rwsem(&uprobe->consumer_rwsem); | 427 | init_rwsem(&uprobe->consumer_rwsem); |
| 456 | mutex_init(&uprobe->copy_mutex); | 428 | /* For now assume that the instruction need not be single-stepped */ |
| 429 | __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
| 457 | 430 | ||
| 458 | /* add to uprobes_tree, sorted on inode:offset */ | 431 | /* add to uprobes_tree, sorted on inode:offset */ |
| 459 | cur_uprobe = insert_uprobe(uprobe); | 432 | cur_uprobe = insert_uprobe(uprobe); |
| @@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
| 463 | kfree(uprobe); | 436 | kfree(uprobe); |
| 464 | uprobe = cur_uprobe; | 437 | uprobe = cur_uprobe; |
| 465 | iput(inode); | 438 | iput(inode); |
| 466 | } else { | ||
| 467 | atomic_inc(&uprobe_events); | ||
| 468 | } | 439 | } |
| 469 | 440 | ||
| 470 | return uprobe; | 441 | return uprobe; |
| 471 | } | 442 | } |
| 472 | 443 | ||
| 473 | static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | 444 | static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) |
| 474 | { | ||
| 475 | struct uprobe_consumer *uc; | ||
| 476 | |||
| 477 | if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) | ||
| 478 | return; | ||
| 479 | |||
| 480 | down_read(&uprobe->consumer_rwsem); | ||
| 481 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
| 482 | if (!uc->filter || uc->filter(uc, current)) | ||
| 483 | uc->handler(uc, regs); | ||
| 484 | } | ||
| 485 | up_read(&uprobe->consumer_rwsem); | ||
| 486 | } | ||
| 487 | |||
| 488 | /* Returns the previous consumer */ | ||
| 489 | static struct uprobe_consumer * | ||
| 490 | consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
| 491 | { | 445 | { |
| 492 | down_write(&uprobe->consumer_rwsem); | 446 | down_write(&uprobe->consumer_rwsem); |
| 493 | uc->next = uprobe->consumers; | 447 | uc->next = uprobe->consumers; |
| 494 | uprobe->consumers = uc; | 448 | uprobe->consumers = uc; |
| 495 | up_write(&uprobe->consumer_rwsem); | 449 | up_write(&uprobe->consumer_rwsem); |
| 496 | |||
| 497 | return uc->next; | ||
| 498 | } | 450 | } |
| 499 | 451 | ||
| 500 | /* | 452 | /* |
| @@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
| 588 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) | 540 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) |
| 589 | return ret; | 541 | return ret; |
| 590 | 542 | ||
| 591 | mutex_lock(&uprobe->copy_mutex); | 543 | /* TODO: move this into _register, until then we abuse this sem. */ |
| 544 | down_write(&uprobe->consumer_rwsem); | ||
| 592 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) | 545 | if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) |
| 593 | goto out; | 546 | goto out; |
| 594 | 547 | ||
| @@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
| 612 | set_bit(UPROBE_COPY_INSN, &uprobe->flags); | 565 | set_bit(UPROBE_COPY_INSN, &uprobe->flags); |
| 613 | 566 | ||
| 614 | out: | 567 | out: |
| 615 | mutex_unlock(&uprobe->copy_mutex); | 568 | up_write(&uprobe->consumer_rwsem); |
| 569 | |||
| 570 | return ret; | ||
| 571 | } | ||
| 572 | |||
| 573 | static inline bool consumer_filter(struct uprobe_consumer *uc, | ||
| 574 | enum uprobe_filter_ctx ctx, struct mm_struct *mm) | ||
| 575 | { | ||
| 576 | return !uc->filter || uc->filter(uc, ctx, mm); | ||
| 577 | } | ||
| 578 | |||
| 579 | static bool filter_chain(struct uprobe *uprobe, | ||
| 580 | enum uprobe_filter_ctx ctx, struct mm_struct *mm) | ||
| 581 | { | ||
| 582 | struct uprobe_consumer *uc; | ||
| 583 | bool ret = false; | ||
| 584 | |||
| 585 | down_read(&uprobe->consumer_rwsem); | ||
| 586 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
| 587 | ret = consumer_filter(uc, ctx, mm); | ||
| 588 | if (ret) | ||
| 589 | break; | ||
| 590 | } | ||
| 591 | up_read(&uprobe->consumer_rwsem); | ||
| 616 | 592 | ||
| 617 | return ret; | 593 | return ret; |
| 618 | } | 594 | } |
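
The new consumer_filter()/filter_chain() helpers make filtering a per-consumer decision: a consumer's ->filter callback now receives an enum uprobe_filter_ctx plus the mm being patched or unpatched, and a NULL ->filter means "always interested". A hedged sketch of a consumer written against the new signature; the callback bodies and the target-mm variable are illustrative only:

    #include <linux/uprobes.h>
    #include <linux/ptrace.h>
    #include <linux/kernel.h>

    static struct mm_struct *example_target_mm;    /* set elsewhere */

    /* Sketch: only keep breakpoints in the mm we are tracing. */
    static bool example_filter(struct uprobe_consumer *self,
                               enum uprobe_filter_ctx ctx, struct mm_struct *mm)
    {
        return mm == example_target_mm;
    }

    static int example_handler(struct uprobe_consumer *self, struct pt_regs *regs)
    {
        pr_info("uprobe hit at ip=%lx\n", instruction_pointer(regs));
        return 0;
    }

    static struct uprobe_consumer example_consumer = {
        .handler = example_handler,
        .filter  = example_filter,
    };
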
| @@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 624 | bool first_uprobe; | 600 | bool first_uprobe; |
| 625 | int ret; | 601 | int ret; |
| 626 | 602 | ||
| 627 | /* | ||
| 628 | * If probe is being deleted, unregister thread could be done with | ||
| 629 | * the vma-rmap-walk through. Adding a probe now can be fatal since | ||
| 630 | * nobody will be able to cleanup. Also we could be from fork or | ||
| 631 | * mremap path, where the probe might have already been inserted. | ||
| 632 | * Hence behave as if probe already existed. | ||
| 633 | */ | ||
| 634 | if (!uprobe->consumers) | ||
| 635 | return 0; | ||
| 636 | |||
| 637 | ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); | 603 | ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); |
| 638 | if (ret) | 604 | if (ret) |
| 639 | return ret; | 605 | return ret; |
| @@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 658 | static int | 624 | static int |
| 659 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) | 625 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) |
| 660 | { | 626 | { |
| 661 | /* can happen if uprobe_register() fails */ | ||
| 662 | if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) | ||
| 663 | return 0; | ||
| 664 | |||
| 665 | set_bit(MMF_RECALC_UPROBES, &mm->flags); | 627 | set_bit(MMF_RECALC_UPROBES, &mm->flags); |
| 666 | return set_orig_insn(&uprobe->arch, mm, vaddr); | 628 | return set_orig_insn(&uprobe->arch, mm, vaddr); |
| 667 | } | 629 | } |
| 668 | 630 | ||
| 631 | static inline bool uprobe_is_active(struct uprobe *uprobe) | ||
| 632 | { | ||
| 633 | return !RB_EMPTY_NODE(&uprobe->rb_node); | ||
| 634 | } | ||
| 669 | /* | 635 | /* |
| 670 | * There could be threads that have already hit the breakpoint. They | 636 | * There could be threads that have already hit the breakpoint. They |
| 671 | * will recheck the current insn and restart if find_uprobe() fails. | 637 | * will recheck the current insn and restart if find_uprobe() fails. |
| @@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad | |||
| 673 | */ | 639 | */ |
| 674 | static void delete_uprobe(struct uprobe *uprobe) | 640 | static void delete_uprobe(struct uprobe *uprobe) |
| 675 | { | 641 | { |
| 642 | if (WARN_ON(!uprobe_is_active(uprobe))) | ||
| 643 | return; | ||
| 644 | |||
| 676 | spin_lock(&uprobes_treelock); | 645 | spin_lock(&uprobes_treelock); |
| 677 | rb_erase(&uprobe->rb_node, &uprobes_tree); | 646 | rb_erase(&uprobe->rb_node, &uprobes_tree); |
| 678 | spin_unlock(&uprobes_treelock); | 647 | spin_unlock(&uprobes_treelock); |
| 648 | RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ | ||
| 679 | iput(uprobe->inode); | 649 | iput(uprobe->inode); |
| 680 | put_uprobe(uprobe); | 650 | put_uprobe(uprobe); |
| 681 | atomic_dec(&uprobe_events); | ||
| 682 | } | 651 | } |
| 683 | 652 | ||
| 684 | struct map_info { | 653 | struct map_info { |
| @@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 764 | return curr; | 733 | return curr; |
| 765 | } | 734 | } |
| 766 | 735 | ||
| 767 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | 736 | static int |
| 737 | register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) | ||
| 768 | { | 738 | { |
| 739 | bool is_register = !!new; | ||
| 769 | struct map_info *info; | 740 | struct map_info *info; |
| 770 | int err = 0; | 741 | int err = 0; |
| 771 | 742 | ||
| @@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
| 794 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) | 765 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) |
| 795 | goto unlock; | 766 | goto unlock; |
| 796 | 767 | ||
| 797 | if (is_register) | 768 | if (is_register) { |
| 798 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); | 769 | /* consult only the "caller", new consumer. */ |
| 799 | else | 770 | if (consumer_filter(new, |
| 800 | err |= remove_breakpoint(uprobe, mm, info->vaddr); | 771 | UPROBE_FILTER_REGISTER, mm)) |
| 772 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); | ||
| 773 | } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { | ||
| 774 | if (!filter_chain(uprobe, | ||
| 775 | UPROBE_FILTER_UNREGISTER, mm)) | ||
| 776 | err |= remove_breakpoint(uprobe, mm, info->vaddr); | ||
| 777 | } | ||
| 801 | 778 | ||
| 802 | unlock: | 779 | unlock: |
| 803 | up_write(&mm->mmap_sem); | 780 | up_write(&mm->mmap_sem); |
| @@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
| 810 | return err; | 787 | return err; |
| 811 | } | 788 | } |
| 812 | 789 | ||
| 813 | static int __uprobe_register(struct uprobe *uprobe) | 790 | static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) |
| 814 | { | 791 | { |
| 815 | return register_for_each_vma(uprobe, true); | 792 | consumer_add(uprobe, uc); |
| 793 | return register_for_each_vma(uprobe, uc); | ||
| 816 | } | 794 | } |
| 817 | 795 | ||
| 818 | static void __uprobe_unregister(struct uprobe *uprobe) | 796 | static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) |
| 819 | { | 797 | { |
| 820 | if (!register_for_each_vma(uprobe, false)) | 798 | int err; |
| 821 | delete_uprobe(uprobe); | 799 | |
| 800 | if (!consumer_del(uprobe, uc)) /* WARN? */ | ||
| 801 | return; | ||
| 822 | 802 | ||
| 803 | err = register_for_each_vma(uprobe, NULL); | ||
| 823 | /* TODO : cant unregister? schedule a worker thread */ | 804 | /* TODO : cant unregister? schedule a worker thread */ |
| 805 | if (!uprobe->consumers && !err) | ||
| 806 | delete_uprobe(uprobe); | ||
| 824 | } | 807 | } |
| 825 | 808 | ||
| 826 | /* | 809 | /* |
| @@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * | |||
| 845 | struct uprobe *uprobe; | 828 | struct uprobe *uprobe; |
| 846 | int ret; | 829 | int ret; |
| 847 | 830 | ||
| 848 | if (!inode || !uc || uc->next) | 831 | /* Racy, just to catch the obvious mistakes */ |
| 849 | return -EINVAL; | ||
| 850 | |||
| 851 | if (offset > i_size_read(inode)) | 832 | if (offset > i_size_read(inode)) |
| 852 | return -EINVAL; | 833 | return -EINVAL; |
| 853 | 834 | ||
| 854 | ret = 0; | 835 | retry: |
| 855 | mutex_lock(uprobes_hash(inode)); | ||
| 856 | uprobe = alloc_uprobe(inode, offset); | 836 | uprobe = alloc_uprobe(inode, offset); |
| 857 | 837 | if (!uprobe) | |
| 858 | if (!uprobe) { | 838 | return -ENOMEM; |
| 859 | ret = -ENOMEM; | 839 | /* |
| 860 | } else if (!consumer_add(uprobe, uc)) { | 840 | * We can race with uprobe_unregister()->delete_uprobe(). |
| 861 | ret = __uprobe_register(uprobe); | 841 | * Check uprobe_is_active() and retry if it is false. |
| 862 | if (ret) { | 842 | */ |
| 863 | uprobe->consumers = NULL; | 843 | down_write(&uprobe->register_rwsem); |
| 864 | __uprobe_unregister(uprobe); | 844 | ret = -EAGAIN; |
| 865 | } else { | 845 | if (likely(uprobe_is_active(uprobe))) { |
| 866 | set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); | 846 | ret = __uprobe_register(uprobe, uc); |
| 867 | } | 847 | if (ret) |
| 848 | __uprobe_unregister(uprobe, uc); | ||
| 868 | } | 849 | } |
| 850 | up_write(&uprobe->register_rwsem); | ||
| 851 | put_uprobe(uprobe); | ||
| 869 | 852 | ||
| 870 | mutex_unlock(uprobes_hash(inode)); | 853 | if (unlikely(ret == -EAGAIN)) |
| 871 | if (uprobe) | 854 | goto retry; |
| 872 | put_uprobe(uprobe); | 855 | return ret; |
| 856 | } | ||
| 857 | EXPORT_SYMBOL_GPL(uprobe_register); | ||
| 858 | |||
| 859 | /* | ||
| 860 | * uprobe_apply - unregister an already registered probe. | ||
| 861 | * @inode: the file in which the probe has to be removed. | ||
| 862 | * @offset: offset from the start of the file. | ||
| 863 | * @uc: consumer which wants to add more or remove some breakpoints | ||
| 864 | * @add: add or remove the breakpoints | ||
| 865 | */ | ||
| 866 | int uprobe_apply(struct inode *inode, loff_t offset, | ||
| 867 | struct uprobe_consumer *uc, bool add) | ||
| 868 | { | ||
| 869 | struct uprobe *uprobe; | ||
| 870 | struct uprobe_consumer *con; | ||
| 871 | int ret = -ENOENT; | ||
| 872 | |||
| 873 | uprobe = find_uprobe(inode, offset); | ||
| 874 | if (!uprobe) | ||
| 875 | return ret; | ||
| 876 | |||
| 877 | down_write(&uprobe->register_rwsem); | ||
| 878 | for (con = uprobe->consumers; con && con != uc ; con = con->next) | ||
| 879 | ; | ||
| 880 | if (con) | ||
| 881 | ret = register_for_each_vma(uprobe, add ? uc : NULL); | ||
| 882 | up_write(&uprobe->register_rwsem); | ||
| 883 | put_uprobe(uprobe); | ||
| 873 | 884 | ||
| 874 | return ret; | 885 | return ret; |
| 875 | } | 886 | } |
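
uprobe_register() now hands the consumer all the way down (consumer_add() runs under the new register_rwsem and breakpoints are installed only where that consumer's filter agrees), retries if it races with delete_uprobe(), and the new uprobe_apply() toggles an already-registered consumer's breakpoints without a full unregister/register cycle. A hedged usage sketch, reusing example_consumer from the sketch above and assuming inode/offset are already known:

    /* Sketch: attach a consumer, mute and re-arm it, then detach it. */
    static int example_attach(struct inode *inode, loff_t offset)
    {
        int err = uprobe_register(inode, offset, &example_consumer);
        if (err)
            return err;

        /* Later: drop the breakpoints this consumer asked for ... */
        uprobe_apply(inode, offset, &example_consumer, false);
        /* ... and put them back, still registered the whole time. */
        uprobe_apply(inode, offset, &example_consumer, true);

        uprobe_unregister(inode, offset, &example_consumer);
        return 0;
    }
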
| @@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume | |||
| 884 | { | 895 | { |
| 885 | struct uprobe *uprobe; | 896 | struct uprobe *uprobe; |
| 886 | 897 | ||
| 887 | if (!inode || !uc) | ||
| 888 | return; | ||
| 889 | |||
| 890 | uprobe = find_uprobe(inode, offset); | 898 | uprobe = find_uprobe(inode, offset); |
| 891 | if (!uprobe) | 899 | if (!uprobe) |
| 892 | return; | 900 | return; |
| 893 | 901 | ||
| 894 | mutex_lock(uprobes_hash(inode)); | 902 | down_write(&uprobe->register_rwsem); |
| 903 | __uprobe_unregister(uprobe, uc); | ||
| 904 | up_write(&uprobe->register_rwsem); | ||
| 905 | put_uprobe(uprobe); | ||
| 906 | } | ||
| 907 | EXPORT_SYMBOL_GPL(uprobe_unregister); | ||
| 895 | 908 | ||
| 896 | if (consumer_del(uprobe, uc)) { | 909 | static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) |
| 897 | if (!uprobe->consumers) { | 910 | { |
| 898 | __uprobe_unregister(uprobe); | 911 | struct vm_area_struct *vma; |
| 899 | clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); | 912 | int err = 0; |
| 900 | } | 913 | |
| 914 | down_read(&mm->mmap_sem); | ||
| 915 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 916 | unsigned long vaddr; | ||
| 917 | loff_t offset; | ||
| 918 | |||
| 919 | if (!valid_vma(vma, false) || | ||
| 920 | vma->vm_file->f_mapping->host != uprobe->inode) | ||
| 921 | continue; | ||
| 922 | |||
| 923 | offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; | ||
| 924 | if (uprobe->offset < offset || | ||
| 925 | uprobe->offset >= offset + vma->vm_end - vma->vm_start) | ||
| 926 | continue; | ||
| 927 | |||
| 928 | vaddr = offset_to_vaddr(vma, uprobe->offset); | ||
| 929 | err |= remove_breakpoint(uprobe, mm, vaddr); | ||
| 901 | } | 930 | } |
| 931 | up_read(&mm->mmap_sem); | ||
| 902 | 932 | ||
| 903 | mutex_unlock(uprobes_hash(inode)); | 933 | return err; |
| 904 | if (uprobe) | ||
| 905 | put_uprobe(uprobe); | ||
| 906 | } | 934 | } |
| 907 | 935 | ||
| 908 | static struct rb_node * | 936 | static struct rb_node * |
| @@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 979 | struct uprobe *uprobe, *u; | 1007 | struct uprobe *uprobe, *u; |
| 980 | struct inode *inode; | 1008 | struct inode *inode; |
| 981 | 1009 | ||
| 982 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | 1010 | if (no_uprobe_events() || !valid_vma(vma, true)) |
| 983 | return 0; | 1011 | return 0; |
| 984 | 1012 | ||
| 985 | inode = vma->vm_file->f_mapping->host; | 1013 | inode = vma->vm_file->f_mapping->host; |
| @@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 988 | 1016 | ||
| 989 | mutex_lock(uprobes_mmap_hash(inode)); | 1017 | mutex_lock(uprobes_mmap_hash(inode)); |
| 990 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); | 1018 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); |
| 991 | 1019 | /* | |
| 1020 | * We can race with uprobe_unregister(), this uprobe can be already | ||
| 1021 | * removed. But in this case filter_chain() must return false, all | ||
| 1022 | * consumers have gone away. | ||
| 1023 | */ | ||
| 992 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1024 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { |
| 993 | if (!fatal_signal_pending(current)) { | 1025 | if (!fatal_signal_pending(current) && |
| 1026 | filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { | ||
| 994 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); | 1027 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); |
| 995 | install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | 1028 | install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); |
| 996 | } | 1029 | } |
| @@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e | |||
| 1025 | */ | 1058 | */ |
| 1026 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | 1059 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) |
| 1027 | { | 1060 | { |
| 1028 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | 1061 | if (no_uprobe_events() || !valid_vma(vma, false)) |
| 1029 | return; | 1062 | return; |
| 1030 | 1063 | ||
| 1031 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ | 1064 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ |
| @@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
| 1042 | /* Slot allocation for XOL */ | 1075 | /* Slot allocation for XOL */ |
| 1043 | static int xol_add_vma(struct xol_area *area) | 1076 | static int xol_add_vma(struct xol_area *area) |
| 1044 | { | 1077 | { |
| 1045 | struct mm_struct *mm; | 1078 | struct mm_struct *mm = current->mm; |
| 1046 | int ret; | 1079 | int ret = -EALREADY; |
| 1047 | |||
| 1048 | area->page = alloc_page(GFP_HIGHUSER); | ||
| 1049 | if (!area->page) | ||
| 1050 | return -ENOMEM; | ||
| 1051 | |||
| 1052 | ret = -EALREADY; | ||
| 1053 | mm = current->mm; | ||
| 1054 | 1080 | ||
| 1055 | down_write(&mm->mmap_sem); | 1081 | down_write(&mm->mmap_sem); |
| 1056 | if (mm->uprobes_state.xol_area) | 1082 | if (mm->uprobes_state.xol_area) |
| 1057 | goto fail; | 1083 | goto fail; |
| 1058 | 1084 | ||
| 1059 | ret = -ENOMEM; | 1085 | ret = -ENOMEM; |
| 1060 | |||
| 1061 | /* Try to map as high as possible, this is only a hint. */ | 1086 | /* Try to map as high as possible, this is only a hint. */ |
| 1062 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | 1087 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); |
| 1063 | if (area->vaddr & ~PAGE_MASK) { | 1088 | if (area->vaddr & ~PAGE_MASK) { |
| @@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area) | |||
| 1073 | smp_wmb(); /* pairs with get_xol_area() */ | 1098 | smp_wmb(); /* pairs with get_xol_area() */ |
| 1074 | mm->uprobes_state.xol_area = area; | 1099 | mm->uprobes_state.xol_area = area; |
| 1075 | ret = 0; | 1100 | ret = 0; |
| 1076 | 1101 | fail: | |
| 1077 | fail: | ||
| 1078 | up_write(&mm->mmap_sem); | 1102 | up_write(&mm->mmap_sem); |
| 1079 | if (ret) | ||
| 1080 | __free_page(area->page); | ||
| 1081 | 1103 | ||
| 1082 | return ret; | 1104 | return ret; |
| 1083 | } | 1105 | } |
| 1084 | 1106 | ||
| 1085 | static struct xol_area *get_xol_area(struct mm_struct *mm) | ||
| 1086 | { | ||
| 1087 | struct xol_area *area; | ||
| 1088 | |||
| 1089 | area = mm->uprobes_state.xol_area; | ||
| 1090 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
| 1091 | |||
| 1092 | return area; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | /* | 1107 | /* |
| 1096 | * xol_alloc_area - Allocate process's xol_area. | 1108 | * get_xol_area - Allocate process's xol_area if necessary. |
| 1097 | * This area will be used for storing instructions for execution out of | 1109 | * This area will be used for storing instructions for execution out of line. |
| 1098 | * line. | ||
| 1099 | * | 1110 | * |
| 1100 | * Returns the allocated area or NULL. | 1111 | * Returns the allocated area or NULL. |
| 1101 | */ | 1112 | */ |
| 1102 | static struct xol_area *xol_alloc_area(void) | 1113 | static struct xol_area *get_xol_area(void) |
| 1103 | { | 1114 | { |
| 1115 | struct mm_struct *mm = current->mm; | ||
| 1104 | struct xol_area *area; | 1116 | struct xol_area *area; |
| 1105 | 1117 | ||
| 1118 | area = mm->uprobes_state.xol_area; | ||
| 1119 | if (area) | ||
| 1120 | goto ret; | ||
| 1121 | |||
| 1106 | area = kzalloc(sizeof(*area), GFP_KERNEL); | 1122 | area = kzalloc(sizeof(*area), GFP_KERNEL); |
| 1107 | if (unlikely(!area)) | 1123 | if (unlikely(!area)) |
| 1108 | return NULL; | 1124 | goto out; |
| 1109 | 1125 | ||
| 1110 | area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); | 1126 | area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); |
| 1111 | |||
| 1112 | if (!area->bitmap) | 1127 | if (!area->bitmap) |
| 1113 | goto fail; | 1128 | goto free_area; |
| 1129 | |||
| 1130 | area->page = alloc_page(GFP_HIGHUSER); | ||
| 1131 | if (!area->page) | ||
| 1132 | goto free_bitmap; | ||
| 1114 | 1133 | ||
| 1115 | init_waitqueue_head(&area->wq); | 1134 | init_waitqueue_head(&area->wq); |
| 1116 | if (!xol_add_vma(area)) | 1135 | if (!xol_add_vma(area)) |
| 1117 | return area; | 1136 | return area; |
| 1118 | 1137 | ||
| 1119 | fail: | 1138 | __free_page(area->page); |
| 1139 | free_bitmap: | ||
| 1120 | kfree(area->bitmap); | 1140 | kfree(area->bitmap); |
| 1141 | free_area: | ||
| 1121 | kfree(area); | 1142 | kfree(area); |
| 1122 | 1143 | out: | |
| 1123 | return get_xol_area(current->mm); | 1144 | area = mm->uprobes_state.xol_area; |
| 1145 | ret: | ||
| 1146 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
| 1147 | return area; | ||
| 1124 | } | 1148 | } |
| 1125 | 1149 | ||
| 1126 | /* | 1150 | /* |
| @@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) | |||
| 1186 | } | 1210 | } |
| 1187 | 1211 | ||
| 1188 | /* | 1212 | /* |
| 1189 | * xol_get_insn_slot - If was not allocated a slot, then | 1213 | * xol_get_insn_slot - allocate a slot for xol. |
| 1190 | * allocate a slot. | ||
| 1191 | * Returns the allocated slot address or 0. | 1214 | * Returns the allocated slot address or 0. |
| 1192 | */ | 1215 | */ |
| 1193 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) | 1216 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe) |
| 1194 | { | 1217 | { |
| 1195 | struct xol_area *area; | 1218 | struct xol_area *area; |
| 1196 | unsigned long offset; | 1219 | unsigned long offset; |
| 1220 | unsigned long xol_vaddr; | ||
| 1197 | void *vaddr; | 1221 | void *vaddr; |
| 1198 | 1222 | ||
| 1199 | area = get_xol_area(current->mm); | 1223 | area = get_xol_area(); |
| 1200 | if (!area) { | 1224 | if (!area) |
| 1201 | area = xol_alloc_area(); | 1225 | return 0; |
| 1202 | if (!area) | ||
| 1203 | return 0; | ||
| 1204 | } | ||
| 1205 | current->utask->xol_vaddr = xol_take_insn_slot(area); | ||
| 1206 | 1226 | ||
| 1207 | /* | 1227 | xol_vaddr = xol_take_insn_slot(area); |
| 1208 | * Initialize the slot if xol_vaddr points to valid | 1228 | if (unlikely(!xol_vaddr)) |
| 1209 | * instruction slot. | ||
| 1210 | */ | ||
| 1211 | if (unlikely(!current->utask->xol_vaddr)) | ||
| 1212 | return 0; | 1229 | return 0; |
| 1213 | 1230 | ||
| 1214 | current->utask->vaddr = slot_addr; | 1231 | /* Initialize the slot */ |
| 1215 | offset = current->utask->xol_vaddr & ~PAGE_MASK; | 1232 | offset = xol_vaddr & ~PAGE_MASK; |
| 1216 | vaddr = kmap_atomic(area->page); | 1233 | vaddr = kmap_atomic(area->page); |
| 1217 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | 1234 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); |
| 1218 | kunmap_atomic(vaddr); | 1235 | kunmap_atomic(vaddr); |
| @@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot | |||
| 1222 | */ | 1239 | */ |
| 1223 | flush_dcache_page(area->page); | 1240 | flush_dcache_page(area->page); |
| 1224 | 1241 | ||
| 1225 | return current->utask->xol_vaddr; | 1242 | return xol_vaddr; |
| 1226 | } | 1243 | } |
| 1227 | 1244 | ||
| 1228 | /* | 1245 | /* |
| @@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) | |||
| 1240 | return; | 1257 | return; |
| 1241 | 1258 | ||
| 1242 | slot_addr = tsk->utask->xol_vaddr; | 1259 | slot_addr = tsk->utask->xol_vaddr; |
| 1243 | 1260 | if (unlikely(!slot_addr)) | |
| 1244 | if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) | ||
| 1245 | return; | 1261 | return; |
| 1246 | 1262 | ||
| 1247 | area = tsk->mm->uprobes_state.xol_area; | 1263 | area = tsk->mm->uprobes_state.xol_area; |
| @@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t) | |||
| 1303 | } | 1319 | } |
| 1304 | 1320 | ||
| 1305 | /* | 1321 | /* |
| 1306 | * Allocate a uprobe_task object for the task. | 1322 | * Allocate a uprobe_task object for the task if necessary. |
| 1307 | * Called when the thread hits a breakpoint for the first time. | 1323 | * Called when the thread hits a breakpoint. |
| 1308 | * | 1324 | * |
| 1309 | * Returns: | 1325 | * Returns: |
| 1310 | * - pointer to new uprobe_task on success | 1326 | * - pointer to new uprobe_task on success |
| 1311 | * - NULL otherwise | 1327 | * - NULL otherwise |
| 1312 | */ | 1328 | */ |
| 1313 | static struct uprobe_task *add_utask(void) | 1329 | static struct uprobe_task *get_utask(void) |
| 1314 | { | 1330 | { |
| 1315 | struct uprobe_task *utask; | 1331 | if (!current->utask) |
| 1316 | 1332 | current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); | |
| 1317 | utask = kzalloc(sizeof *utask, GFP_KERNEL); | 1333 | return current->utask; |
| 1318 | if (unlikely(!utask)) | ||
| 1319 | return NULL; | ||
| 1320 | |||
| 1321 | current->utask = utask; | ||
| 1322 | return utask; | ||
| 1323 | } | 1334 | } |
| 1324 | 1335 | ||
| 1325 | /* Prepare to single-step probed instruction out of line. */ | 1336 | /* Prepare to single-step probed instruction out of line. */ |
| 1326 | static int | 1337 | static int |
| 1327 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) | 1338 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) |
| 1328 | { | 1339 | { |
| 1329 | if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) | 1340 | struct uprobe_task *utask; |
| 1330 | return 0; | 1341 | unsigned long xol_vaddr; |
| 1342 | int err; | ||
| 1343 | |||
| 1344 | utask = get_utask(); | ||
| 1345 | if (!utask) | ||
| 1346 | return -ENOMEM; | ||
| 1347 | |||
| 1348 | xol_vaddr = xol_get_insn_slot(uprobe); | ||
| 1349 | if (!xol_vaddr) | ||
| 1350 | return -ENOMEM; | ||
| 1351 | |||
| 1352 | utask->xol_vaddr = xol_vaddr; | ||
| 1353 | utask->vaddr = bp_vaddr; | ||
| 1354 | |||
| 1355 | err = arch_uprobe_pre_xol(&uprobe->arch, regs); | ||
| 1356 | if (unlikely(err)) { | ||
| 1357 | xol_free_insn_slot(current); | ||
| 1358 | return err; | ||
| 1359 | } | ||
| 1331 | 1360 | ||
| 1332 | return -EFAULT; | 1361 | utask->active_uprobe = uprobe; |
| 1362 | utask->state = UTASK_SSTEP; | ||
| 1363 | return 0; | ||
| 1333 | } | 1364 | } |
| 1334 | 1365 | ||
| 1335 | /* | 1366 | /* |
| @@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) | |||
| 1391 | * This is not strictly accurate, we can race with | 1422 | * This is not strictly accurate, we can race with |
| 1392 | * uprobe_unregister() and see the already removed | 1423 | * uprobe_unregister() and see the already removed |
| 1393 | * uprobe if delete_uprobe() was not yet called. | 1424 | * uprobe if delete_uprobe() was not yet called. |
| 1425 | * Or this uprobe can be filtered out. | ||
| 1394 | */ | 1426 | */ |
| 1395 | if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) | 1427 | if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) |
| 1396 | return; | 1428 | return; |
| @@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
| 1452 | return uprobe; | 1484 | return uprobe; |
| 1453 | } | 1485 | } |
| 1454 | 1486 | ||
| 1487 | static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | ||
| 1488 | { | ||
| 1489 | struct uprobe_consumer *uc; | ||
| 1490 | int remove = UPROBE_HANDLER_REMOVE; | ||
| 1491 | |||
| 1492 | down_read(&uprobe->register_rwsem); | ||
| 1493 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
| 1494 | int rc = uc->handler(uc, regs); | ||
| 1495 | |||
| 1496 | WARN(rc & ~UPROBE_HANDLER_MASK, | ||
| 1497 | "bad rc=0x%x from %pf()\n", rc, uc->handler); | ||
| 1498 | remove &= rc; | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | if (remove && uprobe->consumers) { | ||
| 1502 | WARN_ON(!uprobe_is_active(uprobe)); | ||
| 1503 | unapply_uprobe(uprobe, current->mm); | ||
| 1504 | } | ||
| 1505 | up_read(&uprobe->register_rwsem); | ||
| 1506 | } | ||
| 1507 | |||
| 1455 | /* | 1508 | /* |
| 1456 | * Run handler and ask thread to singlestep. | 1509 | * Run handler and ask thread to singlestep. |
| 1457 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1510 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
| 1458 | */ | 1511 | */ |
| 1459 | static void handle_swbp(struct pt_regs *regs) | 1512 | static void handle_swbp(struct pt_regs *regs) |
| 1460 | { | 1513 | { |
| 1461 | struct uprobe_task *utask; | ||
| 1462 | struct uprobe *uprobe; | 1514 | struct uprobe *uprobe; |
| 1463 | unsigned long bp_vaddr; | 1515 | unsigned long bp_vaddr; |
| 1464 | int uninitialized_var(is_swbp); | 1516 | int uninitialized_var(is_swbp); |
| @@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs) | |||
| 1483 | } | 1535 | } |
| 1484 | return; | 1536 | return; |
| 1485 | } | 1537 | } |
| 1538 | |||
| 1539 | /* change it in advance for ->handler() and restart */ | ||
| 1540 | instruction_pointer_set(regs, bp_vaddr); | ||
| 1541 | |||
| 1486 | /* | 1542 | /* |
| 1487 | * TODO: move copy_insn/etc into _register and remove this hack. | 1543 | * TODO: move copy_insn/etc into _register and remove this hack. |
| 1488 | * After we hit the bp, _unregister + _register can install the | 1544 | * After we hit the bp, _unregister + _register can install the |
| @@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs) | |||
| 1490 | */ | 1546 | */ |
| 1491 | smp_rmb(); /* pairs with wmb() in install_breakpoint() */ | 1547 | smp_rmb(); /* pairs with wmb() in install_breakpoint() */ |
| 1492 | if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) | 1548 | if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) |
| 1493 | goto restart; | 1549 | goto out; |
| 1494 | |||
| 1495 | utask = current->utask; | ||
| 1496 | if (!utask) { | ||
| 1497 | utask = add_utask(); | ||
| 1498 | /* Cannot allocate; re-execute the instruction. */ | ||
| 1499 | if (!utask) | ||
| 1500 | goto restart; | ||
| 1501 | } | ||
| 1502 | 1550 | ||
| 1503 | handler_chain(uprobe, regs); | 1551 | handler_chain(uprobe, regs); |
| 1504 | if (can_skip_sstep(uprobe, regs)) | 1552 | if (can_skip_sstep(uprobe, regs)) |
| 1505 | goto out; | 1553 | goto out; |
| 1506 | 1554 | ||
| 1507 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | 1555 | if (!pre_ssout(uprobe, regs, bp_vaddr)) |
| 1508 | utask->active_uprobe = uprobe; | ||
| 1509 | utask->state = UTASK_SSTEP; | ||
| 1510 | return; | 1556 | return; |
| 1511 | } | ||
| 1512 | 1557 | ||
| 1513 | restart: | 1558 | /* can_skip_sstep() succeeded, or restart if can't singlestep */ |
| 1514 | /* | ||
| 1515 | * cannot singlestep; cannot skip instruction; | ||
| 1516 | * re-execute the instruction. | ||
| 1517 | */ | ||
| 1518 | instruction_pointer_set(regs, bp_vaddr); | ||
| 1519 | out: | 1559 | out: |
| 1520 | put_uprobe(uprobe); | 1560 | put_uprobe(uprobe); |
| 1521 | } | 1561 | } |
| @@ -1609,10 +1649,8 @@ static int __init init_uprobes(void) | |||
| 1609 | { | 1649 | { |
| 1610 | int i; | 1650 | int i; |
| 1611 | 1651 | ||
| 1612 | for (i = 0; i < UPROBES_HASH_SZ; i++) { | 1652 | for (i = 0; i < UPROBES_HASH_SZ; i++) |
| 1613 | mutex_init(&uprobes_mutex[i]); | ||
| 1614 | mutex_init(&uprobes_mmap_mutex[i]); | 1653 | mutex_init(&uprobes_mmap_mutex[i]); |
| 1615 | } | ||
| 1616 | 1654 | ||
| 1617 | if (percpu_init_rwsem(&dup_mmap_sem)) | 1655 | if (percpu_init_rwsem(&dup_mmap_sem)) |
| 1618 | return -ENOMEM; | 1656 | return -ENOMEM; |
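
handler_chain() above also introduces a return-value convention: each consumer ->handler may return UPROBE_HANDLER_REMOVE, and only when every consumer asks for removal is unapply_uprobe() run on the current mm. A short sketch of a self-removing handler; the hit counter and threshold are invented:

    #include <linux/atomic.h>

    /* Sketch: detach this mm's breakpoint after the probe fired three times. */
    static int example_oneshot_handler(struct uprobe_consumer *self,
                                       struct pt_regs *regs)
    {
        static atomic_t hits = ATOMIC_INIT(0);

        if (atomic_inc_return(&hits) >= 3)
            return UPROBE_HANDLER_REMOVE;    /* handler_chain() will unapply */
        return 0;
    }
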
diff --git a/kernel/exit.c b/kernel/exit.c index b4df21937216..51e485ca9935 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/tsacct_kern.h> | 20 | #include <linux/tsacct_kern.h> |
| 21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
| 22 | #include <linux/fdtable.h> | 22 | #include <linux/fdtable.h> |
| 23 | #include <linux/freezer.h> | ||
| 23 | #include <linux/binfmts.h> | 24 | #include <linux/binfmts.h> |
| 24 | #include <linux/nsproxy.h> | 25 | #include <linux/nsproxy.h> |
| 25 | #include <linux/pid_namespace.h> | 26 | #include <linux/pid_namespace.h> |
| @@ -31,7 +32,6 @@ | |||
| 31 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
| 32 | #include <linux/taskstats_kern.h> | 33 | #include <linux/taskstats_kern.h> |
| 33 | #include <linux/delayacct.h> | 34 | #include <linux/delayacct.h> |
| 34 | #include <linux/freezer.h> | ||
| 35 | #include <linux/cgroup.h> | 35 | #include <linux/cgroup.h> |
| 36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
| 37 | #include <linux/signal.h> | 37 | #include <linux/signal.h> |
| @@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 85 | bool group_dead = thread_group_leader(tsk); | 85 | bool group_dead = thread_group_leader(tsk); |
| 86 | struct sighand_struct *sighand; | 86 | struct sighand_struct *sighand; |
| 87 | struct tty_struct *uninitialized_var(tty); | 87 | struct tty_struct *uninitialized_var(tty); |
| 88 | cputime_t utime, stime; | ||
| 88 | 89 | ||
| 89 | sighand = rcu_dereference_check(tsk->sighand, | 90 | sighand = rcu_dereference_check(tsk->sighand, |
| 90 | lockdep_tasklist_lock_is_held()); | 91 | lockdep_tasklist_lock_is_held()); |
| @@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 123 | * We won't ever get here for the group leader, since it | 124 | * We won't ever get here for the group leader, since it |
| 124 | * will have been the last reference on the signal_struct. | 125 | * will have been the last reference on the signal_struct. |
| 125 | */ | 126 | */ |
| 126 | sig->utime += tsk->utime; | 127 | task_cputime(tsk, &utime, &stime); |
| 127 | sig->stime += tsk->stime; | 128 | sig->utime += utime; |
| 128 | sig->gtime += tsk->gtime; | 129 | sig->stime += stime; |
| 130 | sig->gtime += task_gtime(tsk); | ||
| 129 | sig->min_flt += tsk->min_flt; | 131 | sig->min_flt += tsk->min_flt; |
| 130 | sig->maj_flt += tsk->maj_flt; | 132 | sig->maj_flt += tsk->maj_flt; |
| 131 | sig->nvcsw += tsk->nvcsw; | 133 | sig->nvcsw += tsk->nvcsw; |
| @@ -483,7 +485,7 @@ static void exit_mm(struct task_struct * tsk) | |||
| 483 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 485 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
| 484 | if (!self.task) /* see coredump_finish() */ | 486 | if (!self.task) /* see coredump_finish() */ |
| 485 | break; | 487 | break; |
| 486 | schedule(); | 488 | freezable_schedule(); |
| 487 | } | 489 | } |
| 488 | __set_task_state(tsk, TASK_RUNNING); | 490 | __set_task_state(tsk, TASK_RUNNING); |
| 489 | down_read(&mm->mmap_sem); | 491 | down_read(&mm->mmap_sem); |
| @@ -833,7 +835,7 @@ void do_exit(long code) | |||
| 833 | /* | 835 | /* |
| 834 | * Make sure we are holding no locks: | 836 | * Make sure we are holding no locks: |
| 835 | */ | 837 | */ |
| 836 | debug_check_no_locks_held(tsk); | 838 | debug_check_no_locks_held(); |
| 837 | /* | 839 | /* |
| 838 | * We can do this unlocked here. The futex code uses this flag | 840 | * We can do this unlocked here. The futex code uses this flag |
| 839 | * just to verify whether the pi state cleanup has been done | 841 | * just to verify whether the pi state cleanup has been done |
| @@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1092 | sig = p->signal; | 1094 | sig = p->signal; |
| 1093 | psig->cutime += tgutime + sig->cutime; | 1095 | psig->cutime += tgutime + sig->cutime; |
| 1094 | psig->cstime += tgstime + sig->cstime; | 1096 | psig->cstime += tgstime + sig->cstime; |
| 1095 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; | 1097 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
| 1096 | psig->cmin_flt += | 1098 | psig->cmin_flt += |
| 1097 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1099 | p->min_flt + sig->min_flt + sig->cmin_flt; |
| 1098 | psig->cmaj_flt += | 1100 | psig->cmaj_flt += |
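
Besides picking up the cputime accessors, exit_mm() now waits for coredump completion with freezable_schedule() instead of schedule(), so a task sleeping in TASK_UNINTERRUPTIBLE during a dump no longer blocks the freezer. A generic sketch of that sleep pattern, assuming a completion as the wake-up condition:

    #include <linux/completion.h>
    #include <linux/freezer.h>
    #include <linux/sched.h>

    /* Sketch: uninterruptible wait that still lets the freezer make progress. */
    static void example_wait_for(struct completion *done)
    {
        for (;;) {
            set_current_state(TASK_UNINTERRUPTIBLE);
            if (completion_done(done))
                break;
            freezable_schedule();
        }
        __set_current_state(TASK_RUNNING);
    }
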
diff --git a/kernel/fork.c b/kernel/fork.c index 65ca6d27f24e..8d932b1c9056 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 413 | tmp->vm_next = tmp->vm_prev = NULL; | 413 | tmp->vm_next = tmp->vm_prev = NULL; |
| 414 | file = tmp->vm_file; | 414 | file = tmp->vm_file; |
| 415 | if (file) { | 415 | if (file) { |
| 416 | struct inode *inode = file->f_path.dentry->d_inode; | 416 | struct inode *inode = file_inode(file); |
| 417 | struct address_space *mapping = file->f_mapping; | 417 | struct address_space *mapping = file->f_mapping; |
| 418 | 418 | ||
| 419 | get_file(file); | 419 | get_file(file); |
| @@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1233 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1233 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
| 1234 | p->prev_cputime.utime = p->prev_cputime.stime = 0; | 1234 | p->prev_cputime.utime = p->prev_cputime.stime = 0; |
| 1235 | #endif | 1235 | #endif |
| 1236 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | ||
| 1237 | seqlock_init(&p->vtime_seqlock); | ||
| 1238 | p->vtime_snap = 0; | ||
| 1239 | p->vtime_snap_whence = VTIME_SLEEPING; | ||
| 1240 | #endif | ||
| 1241 | |||
| 1236 | #if defined(SPLIT_RSS_COUNTING) | 1242 | #if defined(SPLIT_RSS_COUNTING) |
| 1237 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1243 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
| 1238 | #endif | 1244 | #endif |
| @@ -1668,8 +1674,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | |||
| 1668 | int, tls_val) | 1674 | int, tls_val) |
| 1669 | #endif | 1675 | #endif |
| 1670 | { | 1676 | { |
| 1671 | return do_fork(clone_flags, newsp, 0, | 1677 | long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); |
| 1672 | parent_tidptr, child_tidptr); | 1678 | asmlinkage_protect(5, ret, clone_flags, newsp, |
| 1679 | parent_tidptr, child_tidptr, tls_val); | ||
| 1680 | return ret; | ||
| 1673 | } | 1681 | } |
| 1674 | #endif | 1682 | #endif |
| 1675 | 1683 | ||
| @@ -1853,10 +1861,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1853 | exit_sem(current); | 1861 | exit_sem(current); |
| 1854 | } | 1862 | } |
| 1855 | 1863 | ||
| 1856 | if (new_nsproxy) { | 1864 | if (new_nsproxy) |
| 1857 | switch_task_namespaces(current, new_nsproxy); | 1865 | switch_task_namespaces(current, new_nsproxy); |
| 1858 | new_nsproxy = NULL; | ||
| 1859 | } | ||
| 1860 | 1866 | ||
| 1861 | task_lock(current); | 1867 | task_lock(current); |
| 1862 | 1868 | ||
| @@ -1886,9 +1892,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1886 | } | 1892 | } |
| 1887 | } | 1893 | } |
| 1888 | 1894 | ||
| 1889 | if (new_nsproxy) | ||
| 1890 | put_nsproxy(new_nsproxy); | ||
| 1891 | |||
| 1892 | bad_unshare_cleanup_cred: | 1895 | bad_unshare_cleanup_cred: |
| 1893 | if (new_cred) | 1896 | if (new_cred) |
| 1894 | put_cred(new_cred); | 1897 | put_cred(new_cred); |
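
Like perf_fasync() earlier in this series, dup_mmap() now uses the new file_inode() helper instead of chasing file->f_path.dentry->d_inode by hand. A trivial sketch of the helper in use:

    #include <linux/fs.h>

    /* Sketch: file_inode() hides the f_path.dentry->d_inode dereference. */
    static bool example_same_backing_inode(struct file *a, struct file *b)
    {
        return file_inode(a) == file_inode(b);
    }
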
diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..fbc07a29ec53 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -60,6 +60,7 @@ | |||
| 60 | #include <linux/pid.h> | 60 | #include <linux/pid.h> |
| 61 | #include <linux/nsproxy.h> | 61 | #include <linux/nsproxy.h> |
| 62 | #include <linux/ptrace.h> | 62 | #include <linux/ptrace.h> |
| 63 | #include <linux/sched/rt.h> | ||
| 63 | 64 | ||
| 64 | #include <asm/futex.h> | 65 | #include <asm/futex.h> |
| 65 | 66 | ||
| @@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, | |||
| 2471 | if (!futex_cmpxchg_enabled) | 2472 | if (!futex_cmpxchg_enabled) |
| 2472 | return -ENOSYS; | 2473 | return -ENOSYS; |
| 2473 | 2474 | ||
| 2474 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
| 2475 | |||
| 2476 | rcu_read_lock(); | 2475 | rcu_read_lock(); |
| 2477 | 2476 | ||
| 2478 | ret = -ESRCH; | 2477 | ret = -ESRCH; |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..f9f44fd4d34d 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/nsproxy.h> | 11 | #include <linux/nsproxy.h> |
| 12 | #include <linux/futex.h> | 12 | #include <linux/futex.h> |
| 13 | #include <linux/ptrace.h> | 13 | #include <linux/ptrace.h> |
| 14 | #include <linux/syscalls.h> | ||
| 14 | 15 | ||
| 15 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
| 16 | 17 | ||
| @@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
| 116 | } | 117 | } |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 119 | asmlinkage long | 120 | COMPAT_SYSCALL_DEFINE2(set_robust_list, |
| 120 | compat_sys_set_robust_list(struct compat_robust_list_head __user *head, | 121 | struct compat_robust_list_head __user *, head, |
| 121 | compat_size_t len) | 122 | compat_size_t, len) |
| 122 | { | 123 | { |
| 123 | if (!futex_cmpxchg_enabled) | 124 | if (!futex_cmpxchg_enabled) |
| 124 | return -ENOSYS; | 125 | return -ENOSYS; |
| @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, | |||
| 131 | return 0; | 132 | return 0; |
| 132 | } | 133 | } |
| 133 | 134 | ||
| 134 | asmlinkage long | 135 | COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, |
| 135 | compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | 136 | compat_uptr_t __user *, head_ptr, |
| 136 | compat_size_t __user *len_ptr) | 137 | compat_size_t __user *, len_ptr) |
| 137 | { | 138 | { |
| 138 | struct compat_robust_list_head __user *head; | 139 | struct compat_robust_list_head __user *head; |
| 139 | unsigned long ret; | 140 | unsigned long ret; |
| @@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
| 142 | if (!futex_cmpxchg_enabled) | 143 | if (!futex_cmpxchg_enabled) |
| 143 | return -ENOSYS; | 144 | return -ENOSYS; |
| 144 | 145 | ||
| 145 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
| 146 | |||
| 147 | rcu_read_lock(); | 146 | rcu_read_lock(); |
| 148 | 147 | ||
| 149 | ret = -ESRCH; | 148 | ret = -ESRCH; |
| @@ -172,9 +171,9 @@ err_unlock: | |||
| 172 | return ret; | 171 | return ret; |
| 173 | } | 172 | } |
| 174 | 173 | ||
| 175 | asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | 174 | COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, |
| 176 | struct compat_timespec __user *utime, u32 __user *uaddr2, | 175 | struct compat_timespec __user *, utime, u32 __user *, uaddr2, |
| 177 | u32 val3) | 176 | u32, val3) |
| 178 | { | 177 | { |
| 179 | struct timespec ts; | 178 | struct timespec ts; |
| 180 | ktime_t t, *tp = NULL; | 179 | ktime_t t, *tp = NULL; |
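
The futex_compat.c hunks replace open-coded asmlinkage prototypes with COMPAT_SYSCALL_DEFINEn, which, as I understand the series, lets generic wrapper code handle compat argument normalisation instead of each syscall doing it by hand. A deliberately simplified sketch of the declare-and-define-in-one-macro shape; this is not the kernel's actual expansion and MY_SYSCALL_DEFINE2/my_sys_ are invented names:

#include <stdio.h>

/* Illustration only: the real macro also emits metadata and argument
 * fix-up wrappers around the inner function. */
#define MY_SYSCALL_DEFINE2(name, t1, a1, t2, a2)                         \
    static long do_##name(t1 a1, t2 a2);                                 \
    long my_sys_##name(t1 a1, t2 a2) { return do_##name(a1, a2); }       \
    static long do_##name(t1 a1, t2 a2)

MY_SYSCALL_DEFINE2(add_pair, int, a, int, b)
{
    return (long)a + b;
}

int main(void)
{
    printf("%ld\n", my_sys_add_pair(2, 3));
    return 0;
}
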
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -35,7 +35,7 @@ config GCOV_KERNEL | |||
| 35 | config GCOV_PROFILE_ALL | 35 | config GCOV_PROFILE_ALL |
| 36 | bool "Profile entire Kernel" | 36 | bool "Profile entire Kernel" |
| 37 | depends on GCOV_KERNEL | 37 | depends on GCOV_KERNEL |
| 38 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE | 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE |
| 39 | default n | 39 | default n |
| 40 | ---help--- | 40 | ---help--- |
| 41 | This options activates profiling for the entire kernel. | 41 | This options activates profiling for the entire kernel. |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..cc47812d3feb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -44,6 +44,8 @@ | |||
| 44 | #include <linux/err.h> | 44 | #include <linux/err.h> |
| 45 | #include <linux/debugobjects.h> | 45 | #include <linux/debugobjects.h> |
| 46 | #include <linux/sched.h> | 46 | #include <linux/sched.h> |
| 47 | #include <linux/sched/sysctl.h> | ||
| 48 | #include <linux/sched/rt.h> | ||
| 47 | #include <linux/timer.h> | 49 | #include <linux/timer.h> |
| 48 | 50 | ||
| 49 | #include <asm/uaccess.h> | 51 | #include <asm/uaccess.h> |
| @@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
| 640 | * and expiry check is done in the hrtimer_interrupt or in the softirq. | 642 | * and expiry check is done in the hrtimer_interrupt or in the softirq. |
| 641 | */ | 643 | */ |
| 642 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | 644 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, |
| 643 | struct hrtimer_clock_base *base, | 645 | struct hrtimer_clock_base *base) |
| 644 | int wakeup) | ||
| 645 | { | 646 | { |
| 646 | if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { | 647 | return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); |
| 647 | if (wakeup) { | ||
| 648 | raw_spin_unlock(&base->cpu_base->lock); | ||
| 649 | raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 650 | raw_spin_lock(&base->cpu_base->lock); | ||
| 651 | } else | ||
| 652 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 653 | |||
| 654 | return 1; | ||
| 655 | } | ||
| 656 | |||
| 657 | return 0; | ||
| 658 | } | 648 | } |
| 659 | 649 | ||
| 660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | 650 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) |
| @@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } | |||
| 735 | static inline void | 725 | static inline void |
| 736 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } | 726 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } |
| 737 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | 727 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, |
| 738 | struct hrtimer_clock_base *base, | 728 | struct hrtimer_clock_base *base) |
| 739 | int wakeup) | ||
| 740 | { | 729 | { |
| 741 | return 0; | 730 | return 0; |
| 742 | } | 731 | } |
| @@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 995 | * | 984 | * |
| 996 | * XXX send_remote_softirq() ? | 985 | * XXX send_remote_softirq() ? |
| 997 | */ | 986 | */ |
| 998 | if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) | 987 | if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) |
| 999 | hrtimer_enqueue_reprogram(timer, new_base, wakeup); | 988 | && hrtimer_enqueue_reprogram(timer, new_base)) { |
| 989 | if (wakeup) { | ||
| 990 | /* | ||
| 991 | * We need to drop cpu_base->lock to avoid a | ||
| 992 | * lock ordering issue vs. rq->lock. | ||
| 993 | */ | ||
| 994 | raw_spin_unlock(&new_base->cpu_base->lock); | ||
| 995 | raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 996 | local_irq_restore(flags); | ||
| 997 | return ret; | ||
| 998 | } else { | ||
| 999 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 1000 | } | ||
| 1001 | } | ||
| 1000 | 1002 | ||
| 1001 | unlock_hrtimer_base(timer, &flags); | 1003 | unlock_hrtimer_base(timer, &flags); |
| 1002 | 1004 | ||
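
The __hrtimer_start_range_ns() hunk moves the softirq kick out from under cpu_base->lock, matching its own comment about the ordering issue against rq->lock. A pthread analogy of the "drop lock A before doing anything that takes lock B" rule; base_lock, rq_lock and the helper names are invented for the sketch:

#include <pthread.h>

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock   = PTHREAD_MUTEX_INITIALIZER;
static int pending_wakeup;

static void wake_consumer(void)
{
    pthread_mutex_lock(&rq_lock);       /* stands in for rq->lock */
    pending_wakeup = 1;
    pthread_mutex_unlock(&rq_lock);
}

static void enqueue_and_maybe_wake(int leftmost, int wakeup)
{
    pthread_mutex_lock(&base_lock);     /* stands in for cpu_base->lock */
    /* ... enqueue the timer under base_lock ... */
    if (leftmost && wakeup) {
        /* Drop base_lock first so this path never holds base_lock while
         * acquiring rq_lock, the inversion the hunk above avoids by
         * unlocking and returning early. */
        pthread_mutex_unlock(&base_lock);
        wake_consumer();
        return;
    }
    pthread_mutex_unlock(&base_lock);
}

int main(void)
{
    enqueue_and_maybe_wake(1, 1);
    enqueue_and_maybe_wake(1, 0);
    return pending_wakeup ? 0 : 1;
}
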
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3aca9f29d30e..cbd97ce0b000 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data) | |||
| 90 | EXPORT_SYMBOL(irq_set_handler_data); | 90 | EXPORT_SYMBOL(irq_set_handler_data); |
| 91 | 91 | ||
| 92 | /** | 92 | /** |
| 93 | * irq_set_msi_desc - set MSI descriptor data for an irq | 93 | * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset |
| 94 | * @irq: Interrupt number | 94 | * @irq_base: Interrupt number base |
| 95 | * @entry: Pointer to MSI descriptor data | 95 | * @irq_offset: Interrupt number offset |
| 96 | * @entry: Pointer to MSI descriptor data | ||
| 96 | * | 97 | * |
| 97 | * Set the MSI descriptor entry for an irq | 98 | * Set the MSI descriptor entry for an irq at offset |
| 98 | */ | 99 | */ |
| 99 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) | 100 | int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, |
| 101 | struct msi_desc *entry) | ||
| 100 | { | 102 | { |
| 101 | unsigned long flags; | 103 | unsigned long flags; |
| 102 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); | 104 | struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
| 103 | 105 | ||
| 104 | if (!desc) | 106 | if (!desc) |
| 105 | return -EINVAL; | 107 | return -EINVAL; |
| 106 | desc->irq_data.msi_desc = entry; | 108 | desc->irq_data.msi_desc = entry; |
| 107 | if (entry) | 109 | if (entry && !irq_offset) |
| 108 | entry->irq = irq; | 110 | entry->irq = irq_base; |
| 109 | irq_put_desc_unlock(desc, flags); | 111 | irq_put_desc_unlock(desc, flags); |
| 110 | return 0; | 112 | return 0; |
| 111 | } | 113 | } |
| 112 | 114 | ||
| 113 | /** | 115 | /** |
| 116 | * irq_set_msi_desc - set MSI descriptor data for an irq | ||
| 117 | * @irq: Interrupt number | ||
| 118 | * @entry: Pointer to MSI descriptor data | ||
| 119 | * | ||
| 120 | * Set the MSI descriptor entry for an irq | ||
| 121 | */ | ||
| 122 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) | ||
| 123 | { | ||
| 124 | return irq_set_msi_desc_off(irq, 0, entry); | ||
| 125 | } | ||
| 126 | |||
| 127 | /** | ||
| 114 | * irq_set_chip_data - set irq chip data for an irq | 128 | * irq_set_chip_data - set irq chip data for an irq |
| 115 | * @irq: Interrupt number | 129 | * @irq: Interrupt number |
| 116 | * @data: Pointer to chip specific data | 130 | * @data: Pointer to chip specific data |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa479..fa17855ca65a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
| 17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| 19 | #include <linux/sched/rt.h> | ||
| 19 | #include <linux/task_work.h> | 20 | #include <linux/task_work.h> |
| 20 | 21 | ||
| 21 | #include "internals.h" | 22 | #include "internals.h" |
| @@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) | |||
| 1524 | out: | 1525 | out: |
| 1525 | irq_put_desc_unlock(desc, flags); | 1526 | irq_put_desc_unlock(desc, flags); |
| 1526 | } | 1527 | } |
| 1528 | EXPORT_SYMBOL_GPL(enable_percpu_irq); | ||
| 1527 | 1529 | ||
| 1528 | void disable_percpu_irq(unsigned int irq) | 1530 | void disable_percpu_irq(unsigned int irq) |
| 1529 | { | 1531 | { |
| @@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq) | |||
| 1537 | irq_percpu_disable(desc, cpu); | 1539 | irq_percpu_disable(desc, cpu); |
| 1538 | irq_put_desc_unlock(desc, flags); | 1540 | irq_put_desc_unlock(desc, flags); |
| 1539 | } | 1541 | } |
| 1542 | EXPORT_SYMBOL_GPL(disable_percpu_irq); | ||
| 1540 | 1543 | ||
| 1541 | /* | 1544 | /* |
| 1542 | * Internal function to unregister a percpu irqaction. | 1545 | * Internal function to unregister a percpu irqaction. |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..397db02209ed 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | |||
| 76 | static ssize_t write_irq_affinity(int type, struct file *file, | 76 | static ssize_t write_irq_affinity(int type, struct file *file, |
| 77 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
| 78 | { | 78 | { |
| 79 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; | 79 | unsigned int irq = (int)(long)PDE(file_inode(file))->data; |
| 80 | cpumask_var_t new_value; | 80 | cpumask_var_t new_value; |
| 81 | int err; | 81 | int err; |
| 82 | 82 | ||
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 611cd6003c45..7b5f012bde9d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
| 80 | 80 | ||
| 81 | /* | 81 | /* |
| 82 | * All handlers must agree on IRQF_SHARED, so we test just the | 82 | * All handlers must agree on IRQF_SHARED, so we test just the |
| 83 | * first. Check for action->next as well. | 83 | * first. |
| 84 | */ | 84 | */ |
| 85 | action = desc->action; | 85 | action = desc->action; |
| 86 | if (!action || !(action->flags & IRQF_SHARED) || | 86 | if (!action || !(action->flags & IRQF_SHARED) || |
| 87 | (action->flags & __IRQF_TIMER) || | 87 | (action->flags & __IRQF_TIMER)) |
| 88 | (action->handler(irq, action->dev_id) == IRQ_HANDLED) || | ||
| 89 | !action->next) | ||
| 90 | goto out; | 88 | goto out; |
| 91 | 89 | ||
| 92 | /* Already running on another processor */ | 90 | /* Already running on another processor */ |
| @@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
| 104 | do { | 102 | do { |
| 105 | if (handle_irq_event(desc) == IRQ_HANDLED) | 103 | if (handle_irq_event(desc) == IRQ_HANDLED) |
| 106 | ret = IRQ_HANDLED; | 104 | ret = IRQ_HANDLED; |
| 105 | /* Make sure that there is still a valid action */ | ||
| 107 | action = desc->action; | 106 | action = desc->action; |
| 108 | } while ((desc->istate & IRQS_PENDING) && action); | 107 | } while ((desc->istate & IRQS_PENDING) && action); |
| 109 | desc->istate &= ~IRQS_POLL_INPROGRESS; | 108 | desc->istate &= ~IRQS_POLL_INPROGRESS; |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b2871b..55fcce6065cf 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -12,37 +12,36 @@ | |||
| 12 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
| 13 | #include <linux/hardirq.h> | 13 | #include <linux/hardirq.h> |
| 14 | #include <linux/irqflags.h> | 14 | #include <linux/irqflags.h> |
| 15 | #include <linux/sched.h> | ||
| 16 | #include <linux/tick.h> | ||
| 17 | #include <linux/cpu.h> | ||
| 18 | #include <linux/notifier.h> | ||
| 15 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
| 16 | 20 | ||
| 17 | /* | ||
| 18 | * An entry can be in one of four states: | ||
| 19 | * | ||
| 20 | * free NULL, 0 -> {claimed} : free to be used | ||
| 21 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | ||
| 22 | * pending next, 3 -> {busy} : queued, pending callback | ||
| 23 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | ||
| 24 | */ | ||
| 25 | |||
| 26 | #define IRQ_WORK_PENDING 1UL | ||
| 27 | #define IRQ_WORK_BUSY 2UL | ||
| 28 | #define IRQ_WORK_FLAGS 3UL | ||
| 29 | 21 | ||
| 30 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); | 22 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); |
| 23 | static DEFINE_PER_CPU(int, irq_work_raised); | ||
| 31 | 24 | ||
| 32 | /* | 25 | /* |
| 33 | * Claim the entry so that no one else will poke at it. | 26 | * Claim the entry so that no one else will poke at it. |
| 34 | */ | 27 | */ |
| 35 | static bool irq_work_claim(struct irq_work *work) | 28 | static bool irq_work_claim(struct irq_work *work) |
| 36 | { | 29 | { |
| 37 | unsigned long flags, nflags; | 30 | unsigned long flags, oflags, nflags; |
| 38 | 31 | ||
| 32 | /* | ||
| 33 | * Start with our best wish as a premise but only trust any | ||
| 34 | * flag value after cmpxchg() result. | ||
| 35 | */ | ||
| 36 | flags = work->flags & ~IRQ_WORK_PENDING; | ||
| 39 | for (;;) { | 37 | for (;;) { |
| 40 | flags = work->flags; | ||
| 41 | if (flags & IRQ_WORK_PENDING) | ||
| 42 | return false; | ||
| 43 | nflags = flags | IRQ_WORK_FLAGS; | 38 | nflags = flags | IRQ_WORK_FLAGS; |
| 44 | if (cmpxchg(&work->flags, flags, nflags) == flags) | 39 | oflags = cmpxchg(&work->flags, flags, nflags); |
| 40 | if (oflags == flags) | ||
| 45 | break; | 41 | break; |
| 42 | if (oflags & IRQ_WORK_PENDING) | ||
| 43 | return false; | ||
| 44 | flags = oflags; | ||
| 46 | cpu_relax(); | 45 | cpu_relax(); |
| 47 | } | 46 | } |
| 48 | 47 | ||
| @@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void) | |||
| 57 | } | 56 | } |
| 58 | 57 | ||
| 59 | /* | 58 | /* |
| 60 | * Queue the entry and raise the IPI if needed. | 59 | * Enqueue the irq_work @entry unless it's already pending |
| 60 | * somewhere. | ||
| 61 | * | ||
| 62 | * Can be re-enqueued while the callback is still in progress. | ||
| 61 | */ | 63 | */ |
| 62 | static void __irq_work_queue(struct irq_work *work) | 64 | void irq_work_queue(struct irq_work *work) |
| 63 | { | 65 | { |
| 64 | bool empty; | 66 | /* Only queue if not already pending */ |
| 67 | if (!irq_work_claim(work)) | ||
| 68 | return; | ||
| 65 | 69 | ||
| 70 | /* Queue the entry and raise the IPI if needed. */ | ||
| 66 | preempt_disable(); | 71 | preempt_disable(); |
| 67 | 72 | ||
| 68 | empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); | 73 | llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); |
| 69 | /* The list was empty, raise self-interrupt to start processing. */ | 74 | |
| 70 | if (empty) | 75 | /* |
| 71 | arch_irq_work_raise(); | 76 | * If the work is not "lazy" or the tick is stopped, raise the irq |
| 77 | * work interrupt (if supported by the arch), otherwise, just wait | ||
| 78 | * for the next tick. | ||
| 79 | */ | ||
| 80 | if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { | ||
| 81 | if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) | ||
| 82 | arch_irq_work_raise(); | ||
| 83 | } | ||
| 72 | 84 | ||
| 73 | preempt_enable(); | 85 | preempt_enable(); |
| 74 | } | 86 | } |
| 87 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
| 75 | 88 | ||
| 76 | /* | 89 | bool irq_work_needs_cpu(void) |
| 77 | * Enqueue the irq_work @entry, returns true on success, failure when the | ||
| 78 | * @entry was already enqueued by someone else. | ||
| 79 | * | ||
| 80 | * Can be re-enqueued while the callback is still in progress. | ||
| 81 | */ | ||
| 82 | bool irq_work_queue(struct irq_work *work) | ||
| 83 | { | 90 | { |
| 84 | if (!irq_work_claim(work)) { | 91 | struct llist_head *this_list; |
| 85 | /* | 92 | |
| 86 | * Already enqueued, can't do! | 93 | this_list = &__get_cpu_var(irq_work_list); |
| 87 | */ | 94 | if (llist_empty(this_list)) |
| 88 | return false; | 95 | return false; |
| 89 | } | ||
| 90 | 96 | ||
| 91 | __irq_work_queue(work); | 97 | /* All work should have been flushed before going offline */ |
| 98 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
| 99 | |||
| 92 | return true; | 100 | return true; |
| 93 | } | 101 | } |
| 94 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
| 95 | 102 | ||
| 96 | /* | 103 | static void __irq_work_run(void) |
| 97 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | ||
| 98 | * context with local IRQs disabled. | ||
| 99 | */ | ||
| 100 | void irq_work_run(void) | ||
| 101 | { | 104 | { |
| 105 | unsigned long flags; | ||
| 102 | struct irq_work *work; | 106 | struct irq_work *work; |
| 103 | struct llist_head *this_list; | 107 | struct llist_head *this_list; |
| 104 | struct llist_node *llnode; | 108 | struct llist_node *llnode; |
| 105 | 109 | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Reset the "raised" state right before we check the list because | ||
| 113 | * an NMI may enqueue after we find the list empty from the runner. | ||
| 114 | */ | ||
| 115 | __this_cpu_write(irq_work_raised, 0); | ||
| 116 | barrier(); | ||
| 117 | |||
| 106 | this_list = &__get_cpu_var(irq_work_list); | 118 | this_list = &__get_cpu_var(irq_work_list); |
| 107 | if (llist_empty(this_list)) | 119 | if (llist_empty(this_list)) |
| 108 | return; | 120 | return; |
| 109 | 121 | ||
| 110 | BUG_ON(!in_irq()); | ||
| 111 | BUG_ON(!irqs_disabled()); | 122 | BUG_ON(!irqs_disabled()); |
| 112 | 123 | ||
| 113 | llnode = llist_del_all(this_list); | 124 | llnode = llist_del_all(this_list); |
| @@ -119,16 +130,31 @@ void irq_work_run(void) | |||
| 119 | /* | 130 | /* |
| 120 | * Clear the PENDING bit, after this point the @work | 131 | * Clear the PENDING bit, after this point the @work |
| 121 | * can be re-used. | 132 | * can be re-used. |
| 133 | * Make it immediately visible so that other CPUs trying | ||
| 134 | * to claim that work don't rely on us to handle their data | ||
| 135 | * while we are in the middle of the func. | ||
| 122 | */ | 136 | */ |
| 123 | work->flags = IRQ_WORK_BUSY; | 137 | flags = work->flags & ~IRQ_WORK_PENDING; |
| 138 | xchg(&work->flags, flags); | ||
| 139 | |||
| 124 | work->func(work); | 140 | work->func(work); |
| 125 | /* | 141 | /* |
| 126 | * Clear the BUSY bit and return to the free state if | 142 | * Clear the BUSY bit and return to the free state if |
| 127 | * no-one else claimed it meanwhile. | 143 | * no-one else claimed it meanwhile. |
| 128 | */ | 144 | */ |
| 129 | (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); | 145 | (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); |
| 130 | } | 146 | } |
| 131 | } | 147 | } |
| 148 | |||
| 149 | /* | ||
| 150 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | ||
| 151 | * context with local IRQs disabled. | ||
| 152 | */ | ||
| 153 | void irq_work_run(void) | ||
| 154 | { | ||
| 155 | BUG_ON(!in_irq()); | ||
| 156 | __irq_work_run(); | ||
| 157 | } | ||
| 132 | EXPORT_SYMBOL_GPL(irq_work_run); | 158 | EXPORT_SYMBOL_GPL(irq_work_run); |
| 133 | 159 | ||
| 134 | /* | 160 | /* |
| @@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work) | |||
| 143 | cpu_relax(); | 169 | cpu_relax(); |
| 144 | } | 170 | } |
| 145 | EXPORT_SYMBOL_GPL(irq_work_sync); | 171 | EXPORT_SYMBOL_GPL(irq_work_sync); |
| 172 | |||
| 173 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 174 | static int irq_work_cpu_notify(struct notifier_block *self, | ||
| 175 | unsigned long action, void *hcpu) | ||
| 176 | { | ||
| 177 | long cpu = (long)hcpu; | ||
| 178 | |||
| 179 | switch (action) { | ||
| 180 | case CPU_DYING: | ||
| 181 | /* Called from stop_machine */ | ||
| 182 | if (WARN_ON_ONCE(cpu != smp_processor_id())) | ||
| 183 | break; | ||
| 184 | __irq_work_run(); | ||
| 185 | break; | ||
| 186 | default: | ||
| 187 | break; | ||
| 188 | } | ||
| 189 | return NOTIFY_OK; | ||
| 190 | } | ||
| 191 | |||
| 192 | static struct notifier_block cpu_notify; | ||
| 193 | |||
| 194 | static __init int irq_work_init_cpu_notifier(void) | ||
| 195 | { | ||
| 196 | cpu_notify.notifier_call = irq_work_cpu_notify; | ||
| 197 | cpu_notify.priority = 0; | ||
| 198 | register_cpu_notifier(&cpu_notify); | ||
| 199 | return 0; | ||
| 200 | } | ||
| 201 | device_initcall(irq_work_init_cpu_notifier); | ||
| 202 | |||
| 203 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
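
The reworked irq_work_claim() races with cmpxchg() on the flags word instead of re-reading it, and irq_work_queue() now rate-limits the self-interrupt through the per-cpu irq_work_raised flag (lazy work simply waits for the next tick). A C11-atomics sketch of just the claim loop, using toy flag names rather than the kernel's IRQ_WORK_* constants:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORK_PENDING 1UL
#define WORK_BUSY    2UL
#define WORK_FLAGS   (WORK_PENDING | WORK_BUSY)

struct work {
    _Atomic unsigned long flags;
};

/* Returns true if this caller won the right to enqueue the work. */
static bool work_claim(struct work *w)
{
    unsigned long flags = atomic_load(&w->flags) & ~WORK_PENDING;

    for (;;) {
        unsigned long nflags = flags | WORK_FLAGS;

        /* On failure, 'flags' is refreshed with the value actually seen. */
        if (atomic_compare_exchange_strong(&w->flags, &flags, nflags))
            return true;
        if (flags & WORK_PENDING)
            return false;       /* someone else already queued it */
        /* raced with a BUSY transition; retry with the observed value */
    }
}

int main(void)
{
    struct work w = { .flags = 0 };

    printf("first claim:  %d\n", work_claim(&w));   /* expect 1 */
    printf("second claim: %d\n", work_claim(&w));   /* expect 0 */
    return 0;
}
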
diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..bddd3d7a74b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -54,6 +54,12 @@ struct resource crashk_res = { | |||
| 54 | .end = 0, | 54 | .end = 0, |
| 55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | 55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM |
| 56 | }; | 56 | }; |
| 57 | struct resource crashk_low_res = { | ||
| 58 | .name = "Crash kernel low", | ||
| 59 | .start = 0, | ||
| 60 | .end = 0, | ||
| 61 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
| 62 | }; | ||
| 57 | 63 | ||
| 58 | int kexec_should_crash(struct task_struct *p) | 64 | int kexec_should_crash(struct task_struct *p) |
| 59 | { | 65 | { |
| @@ -223,6 +229,8 @@ out: | |||
| 223 | 229 | ||
| 224 | } | 230 | } |
| 225 | 231 | ||
| 232 | static void kimage_free_page_list(struct list_head *list); | ||
| 233 | |||
| 226 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | 234 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, |
| 227 | unsigned long nr_segments, | 235 | unsigned long nr_segments, |
| 228 | struct kexec_segment __user *segments) | 236 | struct kexec_segment __user *segments) |
| @@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
| 236 | if (result) | 244 | if (result) |
| 237 | goto out; | 245 | goto out; |
| 238 | 246 | ||
| 239 | *rimage = image; | ||
| 240 | |||
| 241 | /* | 247 | /* |
| 242 | * Find a location for the control code buffer, and add it | 248 | * Find a location for the control code buffer, and add it |
| 243 | * the vector of segments so that it's pages will also be | 249 | * the vector of segments so that it's pages will also be |
| @@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
| 248 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 254 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 249 | if (!image->control_code_page) { | 255 | if (!image->control_code_page) { |
| 250 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 256 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); |
| 251 | goto out; | 257 | goto out_free; |
| 252 | } | 258 | } |
| 253 | 259 | ||
| 254 | image->swap_page = kimage_alloc_control_pages(image, 0); | 260 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 255 | if (!image->swap_page) { | 261 | if (!image->swap_page) { |
| 256 | printk(KERN_ERR "Could not allocate swap buffer\n"); | 262 | printk(KERN_ERR "Could not allocate swap buffer\n"); |
| 257 | goto out; | 263 | goto out_free; |
| 258 | } | 264 | } |
| 259 | 265 | ||
| 260 | result = 0; | 266 | *rimage = image; |
| 261 | out: | 267 | return 0; |
| 262 | if (result == 0) | ||
| 263 | *rimage = image; | ||
| 264 | else | ||
| 265 | kfree(image); | ||
| 266 | 268 | ||
| 269 | out_free: | ||
| 270 | kimage_free_page_list(&image->control_pages); | ||
| 271 | kfree(image); | ||
| 272 | out: | ||
| 267 | return result; | 273 | return result; |
| 268 | } | 274 | } |
| 269 | 275 | ||
| @@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
| 310 | mend = mstart + image->segment[i].memsz - 1; | 316 | mend = mstart + image->segment[i].memsz - 1; |
| 311 | /* Ensure we are within the crash kernel limits */ | 317 | /* Ensure we are within the crash kernel limits */ |
| 312 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | 318 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) |
| 313 | goto out; | 319 | goto out_free; |
| 314 | } | 320 | } |
| 315 | 321 | ||
| 316 | /* | 322 | /* |
| @@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
| 323 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 329 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 324 | if (!image->control_code_page) { | 330 | if (!image->control_code_page) { |
| 325 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 331 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); |
| 326 | goto out; | 332 | goto out_free; |
| 327 | } | 333 | } |
| 328 | 334 | ||
| 329 | result = 0; | 335 | *rimage = image; |
| 330 | out: | 336 | return 0; |
| 331 | if (result == 0) | ||
| 332 | *rimage = image; | ||
| 333 | else | ||
| 334 | kfree(image); | ||
| 335 | 337 | ||
| 338 | out_free: | ||
| 339 | kfree(image); | ||
| 340 | out: | ||
| 336 | return result; | 341 | return result; |
| 337 | } | 342 | } |
| 338 | 343 | ||
| @@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
| 497 | 502 | ||
| 498 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | 503 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) |
| 499 | break; | 504 | break; |
| 500 | if (hole_end > crashk_res.end) | ||
| 501 | break; | ||
| 502 | /* See if I overlap any of the segments */ | 505 | /* See if I overlap any of the segments */ |
| 503 | for (i = 0; i < image->nr_segments; i++) { | 506 | for (i = 0; i < image->nr_segments; i++) { |
| 504 | unsigned long mstart, mend; | 507 | unsigned long mstart, mend; |
| @@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline, | |||
| 1369 | * That function is the entry point for command line parsing and should be | 1372 | * That function is the entry point for command line parsing and should be |
| 1370 | * called from the arch-specific code. | 1373 | * called from the arch-specific code. |
| 1371 | */ | 1374 | */ |
| 1372 | int __init parse_crashkernel(char *cmdline, | 1375 | static int __init __parse_crashkernel(char *cmdline, |
| 1373 | unsigned long long system_ram, | 1376 | unsigned long long system_ram, |
| 1374 | unsigned long long *crash_size, | 1377 | unsigned long long *crash_size, |
| 1375 | unsigned long long *crash_base) | 1378 | unsigned long long *crash_base, |
| 1379 | const char *name) | ||
| 1376 | { | 1380 | { |
| 1377 | char *p = cmdline, *ck_cmdline = NULL; | 1381 | char *p = cmdline, *ck_cmdline = NULL; |
| 1378 | char *first_colon, *first_space; | 1382 | char *first_colon, *first_space; |
| @@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline, | |||
| 1382 | *crash_base = 0; | 1386 | *crash_base = 0; |
| 1383 | 1387 | ||
| 1384 | /* find crashkernel and use the last one if there are more */ | 1388 | /* find crashkernel and use the last one if there are more */ |
| 1385 | p = strstr(p, "crashkernel="); | 1389 | p = strstr(p, name); |
| 1386 | while (p) { | 1390 | while (p) { |
| 1387 | ck_cmdline = p; | 1391 | ck_cmdline = p; |
| 1388 | p = strstr(p+1, "crashkernel="); | 1392 | p = strstr(p+1, name); |
| 1389 | } | 1393 | } |
| 1390 | 1394 | ||
| 1391 | if (!ck_cmdline) | 1395 | if (!ck_cmdline) |
| 1392 | return -EINVAL; | 1396 | return -EINVAL; |
| 1393 | 1397 | ||
| 1394 | ck_cmdline += 12; /* strlen("crashkernel=") */ | 1398 | ck_cmdline += strlen(name); |
| 1395 | 1399 | ||
| 1396 | /* | 1400 | /* |
| 1397 | * if the commandline contains a ':', then that's the extended | 1401 | * if the commandline contains a ':', then that's the extended |
| @@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline, | |||
| 1409 | return 0; | 1413 | return 0; |
| 1410 | } | 1414 | } |
| 1411 | 1415 | ||
| 1416 | int __init parse_crashkernel(char *cmdline, | ||
| 1417 | unsigned long long system_ram, | ||
| 1418 | unsigned long long *crash_size, | ||
| 1419 | unsigned long long *crash_base) | ||
| 1420 | { | ||
| 1421 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
| 1422 | "crashkernel="); | ||
| 1423 | } | ||
| 1424 | |||
| 1425 | int __init parse_crashkernel_low(char *cmdline, | ||
| 1426 | unsigned long long system_ram, | ||
| 1427 | unsigned long long *crash_size, | ||
| 1428 | unsigned long long *crash_base) | ||
| 1429 | { | ||
| 1430 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
| 1431 | "crashkernel_low="); | ||
| 1432 | } | ||
| 1412 | 1433 | ||
| 1413 | static void update_vmcoreinfo_note(void) | 1434 | static void update_vmcoreinfo_note(void) |
| 1414 | { | 1435 | { |
| @@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1490 | VMCOREINFO_OFFSET(page, _count); | 1511 | VMCOREINFO_OFFSET(page, _count); |
| 1491 | VMCOREINFO_OFFSET(page, mapping); | 1512 | VMCOREINFO_OFFSET(page, mapping); |
| 1492 | VMCOREINFO_OFFSET(page, lru); | 1513 | VMCOREINFO_OFFSET(page, lru); |
| 1514 | VMCOREINFO_OFFSET(page, _mapcount); | ||
| 1515 | VMCOREINFO_OFFSET(page, private); | ||
| 1493 | VMCOREINFO_OFFSET(pglist_data, node_zones); | 1516 | VMCOREINFO_OFFSET(pglist_data, node_zones); |
| 1494 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | 1517 | VMCOREINFO_OFFSET(pglist_data, nr_zones); |
| 1495 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 1518 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
| @@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1512 | VMCOREINFO_NUMBER(PG_lru); | 1535 | VMCOREINFO_NUMBER(PG_lru); |
| 1513 | VMCOREINFO_NUMBER(PG_private); | 1536 | VMCOREINFO_NUMBER(PG_private); |
| 1514 | VMCOREINFO_NUMBER(PG_swapcache); | 1537 | VMCOREINFO_NUMBER(PG_swapcache); |
| 1538 | VMCOREINFO_NUMBER(PG_slab); | ||
| 1539 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 1540 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
| 1541 | #endif | ||
| 1542 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
| 1515 | 1543 | ||
| 1516 | arch_crash_save_vmcoreinfo(); | 1544 | arch_crash_save_vmcoreinfo(); |
| 1517 | update_vmcoreinfo_note(); | 1545 | update_vmcoreinfo_note(); |
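
__parse_crashkernel() is now parameterised by the option name so that "crashkernel=" and the new "crashkernel_low=" share one scanner, still honouring the last occurrence on the command line. A userspace approximation of that scan (size suffixes only, base suffixes ignored for brevity; this is not the kernel's parse_crashkernel_simple()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Find the last "name" occurrence and parse "<size>[KMG][@base]". */
static int parse_crash_option(const char *cmdline, const char *name,
                              unsigned long long *size, unsigned long long *base)
{
    const char *p = cmdline, *last = NULL;
    char *end;

    while ((p = strstr(p, name)) != NULL) {
        last = p;
        p++;
    }
    if (!last)
        return -1;

    last += strlen(name);
    *size = strtoull(last, &end, 0);
    switch (*end) {
    case 'G': *size <<= 10; /* fall through */
    case 'M': *size <<= 10; /* fall through */
    case 'K': *size <<= 10; end++; break;
    default: break;
    }
    *base = (*end == '@') ? strtoull(end + 1, NULL, 0) : 0;
    return 0;
}

int main(void)
{
    unsigned long long size, base;
    const char *cmdline = "root=/dev/sda1 crashkernel=128M@16M crashkernel_low=72M";

    if (!parse_crash_option(cmdline, "crashkernel_low=", &size, &base))
        printf("low: size=%llu base=%llu\n", size, base);
    return 0;
}
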
diff --git a/kernel/kfifo.c b/kernel/kfifo.c deleted file mode 100644 index 59dcf5b81d24..000000000000 --- a/kernel/kfifo.c +++ /dev/null | |||
| @@ -1,609 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * A generic kernel FIFO implementation | ||
| 3 | * | ||
| 4 | * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 19 | * | ||
| 20 | */ | ||
| 21 | |||
| 22 | #include <linux/kernel.h> | ||
| 23 | #include <linux/export.h> | ||
| 24 | #include <linux/slab.h> | ||
| 25 | #include <linux/err.h> | ||
| 26 | #include <linux/log2.h> | ||
| 27 | #include <linux/uaccess.h> | ||
| 28 | #include <linux/kfifo.h> | ||
| 29 | |||
| 30 | /* | ||
| 31 | * internal helper to calculate the unused elements in a fifo | ||
| 32 | */ | ||
| 33 | static inline unsigned int kfifo_unused(struct __kfifo *fifo) | ||
| 34 | { | ||
| 35 | return (fifo->mask + 1) - (fifo->in - fifo->out); | ||
| 36 | } | ||
| 37 | |||
| 38 | int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, | ||
| 39 | size_t esize, gfp_t gfp_mask) | ||
| 40 | { | ||
| 41 | /* | ||
| 42 | * round down to the next power of 2, since our 'let the indices | ||
| 43 | * wrap' technique works only in this case. | ||
| 44 | */ | ||
| 45 | if (!is_power_of_2(size)) | ||
| 46 | size = rounddown_pow_of_two(size); | ||
| 47 | |||
| 48 | fifo->in = 0; | ||
| 49 | fifo->out = 0; | ||
| 50 | fifo->esize = esize; | ||
| 51 | |||
| 52 | if (size < 2) { | ||
| 53 | fifo->data = NULL; | ||
| 54 | fifo->mask = 0; | ||
| 55 | return -EINVAL; | ||
| 56 | } | ||
| 57 | |||
| 58 | fifo->data = kmalloc(size * esize, gfp_mask); | ||
| 59 | |||
| 60 | if (!fifo->data) { | ||
| 61 | fifo->mask = 0; | ||
| 62 | return -ENOMEM; | ||
| 63 | } | ||
| 64 | fifo->mask = size - 1; | ||
| 65 | |||
| 66 | return 0; | ||
| 67 | } | ||
| 68 | EXPORT_SYMBOL(__kfifo_alloc); | ||
| 69 | |||
| 70 | void __kfifo_free(struct __kfifo *fifo) | ||
| 71 | { | ||
| 72 | kfree(fifo->data); | ||
| 73 | fifo->in = 0; | ||
| 74 | fifo->out = 0; | ||
| 75 | fifo->esize = 0; | ||
| 76 | fifo->data = NULL; | ||
| 77 | fifo->mask = 0; | ||
| 78 | } | ||
| 79 | EXPORT_SYMBOL(__kfifo_free); | ||
| 80 | |||
| 81 | int __kfifo_init(struct __kfifo *fifo, void *buffer, | ||
| 82 | unsigned int size, size_t esize) | ||
| 83 | { | ||
| 84 | size /= esize; | ||
| 85 | |||
| 86 | if (!is_power_of_2(size)) | ||
| 87 | size = rounddown_pow_of_two(size); | ||
| 88 | |||
| 89 | fifo->in = 0; | ||
| 90 | fifo->out = 0; | ||
| 91 | fifo->esize = esize; | ||
| 92 | fifo->data = buffer; | ||
| 93 | |||
| 94 | if (size < 2) { | ||
| 95 | fifo->mask = 0; | ||
| 96 | return -EINVAL; | ||
| 97 | } | ||
| 98 | fifo->mask = size - 1; | ||
| 99 | |||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | EXPORT_SYMBOL(__kfifo_init); | ||
| 103 | |||
| 104 | static void kfifo_copy_in(struct __kfifo *fifo, const void *src, | ||
| 105 | unsigned int len, unsigned int off) | ||
| 106 | { | ||
| 107 | unsigned int size = fifo->mask + 1; | ||
| 108 | unsigned int esize = fifo->esize; | ||
| 109 | unsigned int l; | ||
| 110 | |||
| 111 | off &= fifo->mask; | ||
| 112 | if (esize != 1) { | ||
| 113 | off *= esize; | ||
| 114 | size *= esize; | ||
| 115 | len *= esize; | ||
| 116 | } | ||
| 117 | l = min(len, size - off); | ||
| 118 | |||
| 119 | memcpy(fifo->data + off, src, l); | ||
| 120 | memcpy(fifo->data, src + l, len - l); | ||
| 121 | /* | ||
| 122 | * make sure that the data in the fifo is up to date before | ||
| 123 | * incrementing the fifo->in index counter | ||
| 124 | */ | ||
| 125 | smp_wmb(); | ||
| 126 | } | ||
| 127 | |||
| 128 | unsigned int __kfifo_in(struct __kfifo *fifo, | ||
| 129 | const void *buf, unsigned int len) | ||
| 130 | { | ||
| 131 | unsigned int l; | ||
| 132 | |||
| 133 | l = kfifo_unused(fifo); | ||
| 134 | if (len > l) | ||
| 135 | len = l; | ||
| 136 | |||
| 137 | kfifo_copy_in(fifo, buf, len, fifo->in); | ||
| 138 | fifo->in += len; | ||
| 139 | return len; | ||
| 140 | } | ||
| 141 | EXPORT_SYMBOL(__kfifo_in); | ||
| 142 | |||
| 143 | static void kfifo_copy_out(struct __kfifo *fifo, void *dst, | ||
| 144 | unsigned int len, unsigned int off) | ||
| 145 | { | ||
| 146 | unsigned int size = fifo->mask + 1; | ||
| 147 | unsigned int esize = fifo->esize; | ||
| 148 | unsigned int l; | ||
| 149 | |||
| 150 | off &= fifo->mask; | ||
| 151 | if (esize != 1) { | ||
| 152 | off *= esize; | ||
| 153 | size *= esize; | ||
| 154 | len *= esize; | ||
| 155 | } | ||
| 156 | l = min(len, size - off); | ||
| 157 | |||
| 158 | memcpy(dst, fifo->data + off, l); | ||
| 159 | memcpy(dst + l, fifo->data, len - l); | ||
| 160 | /* | ||
| 161 | * make sure that the data is copied before | ||
| 162 | * incrementing the fifo->out index counter | ||
| 163 | */ | ||
| 164 | smp_wmb(); | ||
| 165 | } | ||
| 166 | |||
| 167 | unsigned int __kfifo_out_peek(struct __kfifo *fifo, | ||
| 168 | void *buf, unsigned int len) | ||
| 169 | { | ||
| 170 | unsigned int l; | ||
| 171 | |||
| 172 | l = fifo->in - fifo->out; | ||
| 173 | if (len > l) | ||
| 174 | len = l; | ||
| 175 | |||
| 176 | kfifo_copy_out(fifo, buf, len, fifo->out); | ||
| 177 | return len; | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL(__kfifo_out_peek); | ||
| 180 | |||
| 181 | unsigned int __kfifo_out(struct __kfifo *fifo, | ||
| 182 | void *buf, unsigned int len) | ||
| 183 | { | ||
| 184 | len = __kfifo_out_peek(fifo, buf, len); | ||
| 185 | fifo->out += len; | ||
| 186 | return len; | ||
| 187 | } | ||
| 188 | EXPORT_SYMBOL(__kfifo_out); | ||
| 189 | |||
| 190 | static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, | ||
| 191 | const void __user *from, unsigned int len, unsigned int off, | ||
| 192 | unsigned int *copied) | ||
| 193 | { | ||
| 194 | unsigned int size = fifo->mask + 1; | ||
| 195 | unsigned int esize = fifo->esize; | ||
| 196 | unsigned int l; | ||
| 197 | unsigned long ret; | ||
| 198 | |||
| 199 | off &= fifo->mask; | ||
| 200 | if (esize != 1) { | ||
| 201 | off *= esize; | ||
| 202 | size *= esize; | ||
| 203 | len *= esize; | ||
| 204 | } | ||
| 205 | l = min(len, size - off); | ||
| 206 | |||
| 207 | ret = copy_from_user(fifo->data + off, from, l); | ||
| 208 | if (unlikely(ret)) | ||
| 209 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
| 210 | else { | ||
| 211 | ret = copy_from_user(fifo->data, from + l, len - l); | ||
| 212 | if (unlikely(ret)) | ||
| 213 | ret = DIV_ROUND_UP(ret, esize); | ||
| 214 | } | ||
| 215 | /* | ||
| 216 | * make sure that the data in the fifo is up to date before | ||
| 217 | * incrementing the fifo->in index counter | ||
| 218 | */ | ||
| 219 | smp_wmb(); | ||
| 220 | *copied = len - ret; | ||
| 221 | /* return the number of elements which are not copied */ | ||
| 222 | return ret; | ||
| 223 | } | ||
| 224 | |||
| 225 | int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, | ||
| 226 | unsigned long len, unsigned int *copied) | ||
| 227 | { | ||
| 228 | unsigned int l; | ||
| 229 | unsigned long ret; | ||
| 230 | unsigned int esize = fifo->esize; | ||
| 231 | int err; | ||
| 232 | |||
| 233 | if (esize != 1) | ||
| 234 | len /= esize; | ||
| 235 | |||
| 236 | l = kfifo_unused(fifo); | ||
| 237 | if (len > l) | ||
| 238 | len = l; | ||
| 239 | |||
| 240 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); | ||
| 241 | if (unlikely(ret)) { | ||
| 242 | len -= ret; | ||
| 243 | err = -EFAULT; | ||
| 244 | } else | ||
| 245 | err = 0; | ||
| 246 | fifo->in += len; | ||
| 247 | return err; | ||
| 248 | } | ||
| 249 | EXPORT_SYMBOL(__kfifo_from_user); | ||
| 250 | |||
| 251 | static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, | ||
| 252 | unsigned int len, unsigned int off, unsigned int *copied) | ||
| 253 | { | ||
| 254 | unsigned int l; | ||
| 255 | unsigned long ret; | ||
| 256 | unsigned int size = fifo->mask + 1; | ||
| 257 | unsigned int esize = fifo->esize; | ||
| 258 | |||
| 259 | off &= fifo->mask; | ||
| 260 | if (esize != 1) { | ||
| 261 | off *= esize; | ||
| 262 | size *= esize; | ||
| 263 | len *= esize; | ||
| 264 | } | ||
| 265 | l = min(len, size - off); | ||
| 266 | |||
| 267 | ret = copy_to_user(to, fifo->data + off, l); | ||
| 268 | if (unlikely(ret)) | ||
| 269 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
| 270 | else { | ||
| 271 | ret = copy_to_user(to + l, fifo->data, len - l); | ||
| 272 | if (unlikely(ret)) | ||
| 273 | ret = DIV_ROUND_UP(ret, esize); | ||
| 274 | } | ||
| 275 | /* | ||
| 276 | * make sure that the data is copied before | ||
| 277 | * incrementing the fifo->out index counter | ||
| 278 | */ | ||
| 279 | smp_wmb(); | ||
| 280 | *copied = len - ret; | ||
| 281 | /* return the number of elements which are not copied */ | ||
| 282 | return ret; | ||
| 283 | } | ||
| 284 | |||
| 285 | int __kfifo_to_user(struct __kfifo *fifo, void __user *to, | ||
| 286 | unsigned long len, unsigned int *copied) | ||
| 287 | { | ||
| 288 | unsigned int l; | ||
| 289 | unsigned long ret; | ||
| 290 | unsigned int esize = fifo->esize; | ||
| 291 | int err; | ||
| 292 | |||
| 293 | if (esize != 1) | ||
| 294 | len /= esize; | ||
| 295 | |||
| 296 | l = fifo->in - fifo->out; | ||
| 297 | if (len > l) | ||
| 298 | len = l; | ||
| 299 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); | ||
| 300 | if (unlikely(ret)) { | ||
| 301 | len -= ret; | ||
| 302 | err = -EFAULT; | ||
| 303 | } else | ||
| 304 | err = 0; | ||
| 305 | fifo->out += len; | ||
| 306 | return err; | ||
| 307 | } | ||
| 308 | EXPORT_SYMBOL(__kfifo_to_user); | ||
| 309 | |||
| 310 | static int setup_sgl_buf(struct scatterlist *sgl, void *buf, | ||
| 311 | int nents, unsigned int len) | ||
| 312 | { | ||
| 313 | int n; | ||
| 314 | unsigned int l; | ||
| 315 | unsigned int off; | ||
| 316 | struct page *page; | ||
| 317 | |||
| 318 | if (!nents) | ||
| 319 | return 0; | ||
| 320 | |||
| 321 | if (!len) | ||
| 322 | return 0; | ||
| 323 | |||
| 324 | n = 0; | ||
| 325 | page = virt_to_page(buf); | ||
| 326 | off = offset_in_page(buf); | ||
| 327 | l = 0; | ||
| 328 | |||
| 329 | while (len >= l + PAGE_SIZE - off) { | ||
| 330 | struct page *npage; | ||
| 331 | |||
| 332 | l += PAGE_SIZE; | ||
| 333 | buf += PAGE_SIZE; | ||
| 334 | npage = virt_to_page(buf); | ||
| 335 | if (page_to_phys(page) != page_to_phys(npage) - l) { | ||
| 336 | sg_set_page(sgl, page, l - off, off); | ||
| 337 | sgl = sg_next(sgl); | ||
| 338 | if (++n == nents || sgl == NULL) | ||
| 339 | return n; | ||
| 340 | page = npage; | ||
| 341 | len -= l - off; | ||
| 342 | l = off = 0; | ||
| 343 | } | ||
| 344 | } | ||
| 345 | sg_set_page(sgl, page, len, off); | ||
| 346 | return n + 1; | ||
| 347 | } | ||
| 348 | |||
| 349 | static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, | ||
| 350 | int nents, unsigned int len, unsigned int off) | ||
| 351 | { | ||
| 352 | unsigned int size = fifo->mask + 1; | ||
| 353 | unsigned int esize = fifo->esize; | ||
| 354 | unsigned int l; | ||
| 355 | unsigned int n; | ||
| 356 | |||
| 357 | off &= fifo->mask; | ||
| 358 | if (esize != 1) { | ||
| 359 | off *= esize; | ||
| 360 | size *= esize; | ||
| 361 | len *= esize; | ||
| 362 | } | ||
| 363 | l = min(len, size - off); | ||
| 364 | |||
| 365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); | ||
| 366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); | ||
| 367 | |||
| 368 | return n; | ||
| 369 | } | ||
| 370 | |||
| 371 | unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, | ||
| 372 | struct scatterlist *sgl, int nents, unsigned int len) | ||
| 373 | { | ||
| 374 | unsigned int l; | ||
| 375 | |||
| 376 | l = kfifo_unused(fifo); | ||
| 377 | if (len > l) | ||
| 378 | len = l; | ||
| 379 | |||
| 380 | return setup_sgl(fifo, sgl, nents, len, fifo->in); | ||
| 381 | } | ||
| 382 | EXPORT_SYMBOL(__kfifo_dma_in_prepare); | ||
| 383 | |||
| 384 | unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, | ||
| 385 | struct scatterlist *sgl, int nents, unsigned int len) | ||
| 386 | { | ||
| 387 | unsigned int l; | ||
| 388 | |||
| 389 | l = fifo->in - fifo->out; | ||
| 390 | if (len > l) | ||
| 391 | len = l; | ||
| 392 | |||
| 393 | return setup_sgl(fifo, sgl, nents, len, fifo->out); | ||
| 394 | } | ||
| 395 | EXPORT_SYMBOL(__kfifo_dma_out_prepare); | ||
| 396 | |||
| 397 | unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | ||
| 398 | { | ||
| 399 | unsigned int max = (1 << (recsize << 3)) - 1; | ||
| 400 | |||
| 401 | if (len > max) | ||
| 402 | return max; | ||
| 403 | return len; | ||
| 404 | } | ||
| 405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
| 406 | |||
| 407 | #define __KFIFO_PEEK(data, out, mask) \ | ||
| 408 | ((data)[(out) & (mask)]) | ||
| 409 | /* | ||
| 410 | * __kfifo_peek_n internal helper function for determinate the length of | ||
| 411 | * the next record in the fifo | ||
| 412 | */ | ||
| 413 | static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) | ||
| 414 | { | ||
| 415 | unsigned int l; | ||
| 416 | unsigned int mask = fifo->mask; | ||
| 417 | unsigned char *data = fifo->data; | ||
| 418 | |||
| 419 | l = __KFIFO_PEEK(data, fifo->out, mask); | ||
| 420 | |||
| 421 | if (--recsize) | ||
| 422 | l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; | ||
| 423 | |||
| 424 | return l; | ||
| 425 | } | ||
| 426 | |||
| 427 | #define __KFIFO_POKE(data, in, mask, val) \ | ||
| 428 | ( \ | ||
| 429 | (data)[(in) & (mask)] = (unsigned char)(val) \ | ||
| 430 | ) | ||
| 431 | |||
| 432 | /* | ||
| 433 | * __kfifo_poke_n internal helper function for storeing the length of | ||
| 434 | * the record into the fifo | ||
| 435 | */ | ||
| 436 | static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) | ||
| 437 | { | ||
| 438 | unsigned int mask = fifo->mask; | ||
| 439 | unsigned char *data = fifo->data; | ||
| 440 | |||
| 441 | __KFIFO_POKE(data, fifo->in, mask, n); | ||
| 442 | |||
| 443 | if (recsize > 1) | ||
| 444 | __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); | ||
| 445 | } | ||
| 446 | |||
| 447 | unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) | ||
| 448 | { | ||
| 449 | return __kfifo_peek_n(fifo, recsize); | ||
| 450 | } | ||
| 451 | EXPORT_SYMBOL(__kfifo_len_r); | ||
| 452 | |||
| 453 | unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, | ||
| 454 | unsigned int len, size_t recsize) | ||
| 455 | { | ||
| 456 | if (len + recsize > kfifo_unused(fifo)) | ||
| 457 | return 0; | ||
| 458 | |||
| 459 | __kfifo_poke_n(fifo, len, recsize); | ||
| 460 | |||
| 461 | kfifo_copy_in(fifo, buf, len, fifo->in + recsize); | ||
| 462 | fifo->in += len + recsize; | ||
| 463 | return len; | ||
| 464 | } | ||
| 465 | EXPORT_SYMBOL(__kfifo_in_r); | ||
| 466 | |||
| 467 | static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, | ||
| 468 | void *buf, unsigned int len, size_t recsize, unsigned int *n) | ||
| 469 | { | ||
| 470 | *n = __kfifo_peek_n(fifo, recsize); | ||
| 471 | |||
| 472 | if (len > *n) | ||
| 473 | len = *n; | ||
| 474 | |||
| 475 | kfifo_copy_out(fifo, buf, len, fifo->out + recsize); | ||
| 476 | return len; | ||
| 477 | } | ||
| 478 | |||
| 479 | unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, | ||
| 480 | unsigned int len, size_t recsize) | ||
| 481 | { | ||
| 482 | unsigned int n; | ||
| 483 | |||
| 484 | if (fifo->in == fifo->out) | ||
| 485 | return 0; | ||
| 486 | |||
| 487 | return kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
| 488 | } | ||
| 489 | EXPORT_SYMBOL(__kfifo_out_peek_r); | ||
| 490 | |||
| 491 | unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, | ||
| 492 | unsigned int len, size_t recsize) | ||
| 493 | { | ||
| 494 | unsigned int n; | ||
| 495 | |||
| 496 | if (fifo->in == fifo->out) | ||
| 497 | return 0; | ||
| 498 | |||
| 499 | len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
| 500 | fifo->out += n + recsize; | ||
| 501 | return len; | ||
| 502 | } | ||
| 503 | EXPORT_SYMBOL(__kfifo_out_r); | ||
| 504 | |||
| 505 | void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) | ||
| 506 | { | ||
| 507 | unsigned int n; | ||
| 508 | |||
| 509 | n = __kfifo_peek_n(fifo, recsize); | ||
| 510 | fifo->out += n + recsize; | ||
| 511 | } | ||
| 512 | EXPORT_SYMBOL(__kfifo_skip_r); | ||
| 513 | |||
| 514 | int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, | ||
| 515 | unsigned long len, unsigned int *copied, size_t recsize) | ||
| 516 | { | ||
| 517 | unsigned long ret; | ||
| 518 | |||
| 519 | len = __kfifo_max_r(len, recsize); | ||
| 520 | |||
| 521 | if (len + recsize > kfifo_unused(fifo)) { | ||
| 522 | *copied = 0; | ||
| 523 | return 0; | ||
| 524 | } | ||
| 525 | |||
| 526 | __kfifo_poke_n(fifo, len, recsize); | ||
| 527 | |||
| 528 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); | ||
| 529 | if (unlikely(ret)) { | ||
| 530 | *copied = 0; | ||
| 531 | return -EFAULT; | ||
| 532 | } | ||
| 533 | fifo->in += len + recsize; | ||
| 534 | return 0; | ||
| 535 | } | ||
| 536 | EXPORT_SYMBOL(__kfifo_from_user_r); | ||
| 537 | |||
| 538 | int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, | ||
| 539 | unsigned long len, unsigned int *copied, size_t recsize) | ||
| 540 | { | ||
| 541 | unsigned long ret; | ||
| 542 | unsigned int n; | ||
| 543 | |||
| 544 | if (fifo->in == fifo->out) { | ||
| 545 | *copied = 0; | ||
| 546 | return 0; | ||
| 547 | } | ||
| 548 | |||
| 549 | n = __kfifo_peek_n(fifo, recsize); | ||
| 550 | if (len > n) | ||
| 551 | len = n; | ||
| 552 | |||
| 553 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); | ||
| 554 | if (unlikely(ret)) { | ||
| 555 | *copied = 0; | ||
| 556 | return -EFAULT; | ||
| 557 | } | ||
| 558 | fifo->out += n + recsize; | ||
| 559 | return 0; | ||
| 560 | } | ||
| 561 | EXPORT_SYMBOL(__kfifo_to_user_r); | ||
| 562 | |||
| 563 | unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, | ||
| 564 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
| 565 | { | ||
| 566 | if (!nents) | ||
| 567 | BUG(); | ||
| 568 | |||
| 569 | len = __kfifo_max_r(len, recsize); | ||
| 570 | |||
| 571 | if (len + recsize > kfifo_unused(fifo)) | ||
| 572 | return 0; | ||
| 573 | |||
| 574 | return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); | ||
| 575 | } | ||
| 576 | EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); | ||
| 577 | |||
| 578 | void __kfifo_dma_in_finish_r(struct __kfifo *fifo, | ||
| 579 | unsigned int len, size_t recsize) | ||
| 580 | { | ||
| 581 | len = __kfifo_max_r(len, recsize); | ||
| 582 | __kfifo_poke_n(fifo, len, recsize); | ||
| 583 | fifo->in += len + recsize; | ||
| 584 | } | ||
| 585 | EXPORT_SYMBOL(__kfifo_dma_in_finish_r); | ||
| 586 | |||
| 587 | unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, | ||
| 588 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
| 589 | { | ||
| 590 | if (!nents) | ||
| 591 | BUG(); | ||
| 592 | |||
| 593 | len = __kfifo_max_r(len, recsize); | ||
| 594 | |||
| 595 | if (len + recsize > fifo->in - fifo->out) | ||
| 596 | return 0; | ||
| 597 | |||
| 598 | return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); | ||
| 599 | } | ||
| 600 | EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); | ||
| 601 | |||
| 602 | void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) | ||
| 603 | { | ||
| 604 | unsigned int len; | ||
| 605 | |||
| 606 | len = __kfifo_peek_n(fifo, recsize); | ||
| 607 | fifo->out += len + recsize; | ||
| 608 | } | ||
| 609 | EXPORT_SYMBOL(__kfifo_dma_out_finish_r); | ||
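
kernel/kfifo.c is deleted here, but the kfifo API itself survives; the implementation was relocated elsewhere in the tree (to lib/ in mainline, if memory serves). Its central trick, free-running in/out counters masked against a power-of-two size, is easy to show in isolation. A minimal single-producer/single-consumer sketch with no locking, barriers, or element sizes:

#include <stdio.h>
#include <string.h>

/* Power-of-two ring: mask = size - 1, counters wrap naturally. */
struct ring {
    unsigned int in, out, mask;
    unsigned char buf[16];
};

static unsigned int ring_unused(const struct ring *r)
{
    return (r->mask + 1) - (r->in - r->out);
}

static unsigned int ring_in(struct ring *r, const void *src, unsigned int len)
{
    unsigned int off = r->in & r->mask, l;

    if (len > ring_unused(r))
        len = ring_unused(r);
    l = len < (r->mask + 1 - off) ? len : (r->mask + 1 - off);
    memcpy(r->buf + off, src, l);                               /* up to the end */
    memcpy(r->buf, (const unsigned char *)src + l, len - l);    /* wrap around */
    r->in += len;
    return len;
}

static unsigned int ring_out(struct ring *r, void *dst, unsigned int len)
{
    unsigned int off = r->out & r->mask, avail = r->in - r->out, l;

    if (len > avail)
        len = avail;
    l = len < (r->mask + 1 - off) ? len : (r->mask + 1 - off);
    memcpy(dst, r->buf + off, l);
    memcpy((unsigned char *)dst + l, r->buf, len - l);
    r->out += len;
    return len;
}

int main(void)
{
    struct ring r = { .mask = sizeof(r.buf) - 1 };
    char out[8] = "";

    ring_in(&r, "hello", 5);
    ring_out(&r, out, 5);
    printf("%s\n", out);
    return 0;
}

The deleted code additionally handles arbitrary element sizes, user-space copies, scatterlists and record mode, and carries the smp_wmb() barriers a real SMP producer/consumer needs.
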
diff --git a/kernel/kmod.c b/kernel/kmod.c index 0023a87e8de6..56dd34976d7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/suspend.h> | 38 | #include <linux/suspend.h> |
| 39 | #include <linux/rwsem.h> | 39 | #include <linux/rwsem.h> |
| 40 | #include <linux/ptrace.h> | 40 | #include <linux/ptrace.h> |
| 41 | #include <linux/async.h> | ||
| 41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
| 42 | 43 | ||
| 43 | #include <trace/events/module.h> | 44 | #include <trace/events/module.h> |
| @@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 130 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ | 131 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ |
| 131 | static int kmod_loop_msg; | 132 | static int kmod_loop_msg; |
| 132 | 133 | ||
| 134 | /* | ||
| 135 | * We don't allow synchronous module loading from async. Module | ||
| 136 | * init may invoke async_synchronize_full() which will end up | ||
| 137 | * waiting for this task which already is waiting for the module | ||
| 138 | * loading to complete, leading to a deadlock. | ||
| 139 | */ | ||
| 140 | WARN_ON_ONCE(wait && current_is_async()); | ||
| 141 | |||
| 133 | va_start(args, fmt); | 142 | va_start(args, fmt); |
| 134 | ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); | 143 | ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); |
| 135 | va_end(args); | 144 | va_end(args); |
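
The new WARN_ON_ONCE(wait && current_is_async()) encodes a simple rule: code running on the async pool must not block on work that in turn drains the async pool, or it ends up waiting for itself. The same rule in a generic thread-pool sketch; current_is_worker and drain_all_jobs are invented stand-ins for current_is_async() and async_synchronize_full():

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool current_is_worker;    /* set by the pool for its threads */

static void drain_all_jobs(void)
{
    if (current_is_worker) {
        /* A worker waiting for "every job" is also waiting for its own
         * job: flag it instead of deadlocking, as the hunk above does. */
        fprintf(stderr, "refusing to drain the pool from inside the pool\n");
        return;
    }
    /* ... block until the job queue is empty ... */
}

int main(void)
{
    drain_all_jobs();               /* fine from ordinary context */
    current_is_worker = true;       /* pretend we are now a pool worker */
    drain_all_jobs();               /* flagged rather than hung */
    return 0;
}
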
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa409..e35be53f6613 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void) | |||
| 334 | struct kprobe __kprobes *get_kprobe(void *addr) | 334 | struct kprobe __kprobes *get_kprobe(void *addr) |
| 335 | { | 335 | { |
| 336 | struct hlist_head *head; | 336 | struct hlist_head *head; |
| 337 | struct hlist_node *node; | ||
| 338 | struct kprobe *p; | 337 | struct kprobe *p; |
| 339 | 338 | ||
| 340 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | 339 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; |
| 341 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 340 | hlist_for_each_entry_rcu(p, head, hlist) { |
| 342 | if (p->addr == addr) | 341 | if (p->addr == addr) |
| 343 | return p; | 342 | return p; |
| 344 | } | 343 | } |
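
The repeated removals of "struct hlist_node *node" in this file follow the tree-wide change that dropped the separate node cursor from the hlist_for_each_entry*() iterators: the typed entry is now the only loop variable, and the raw node is derived from it each step. A plain-C sketch of the container_of mechanics behind that style (simplified chain of nodes, not the kernel's hlist or RCU macros):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* One typed cursor; the embedded node is recovered via container_of. */
#define each_entry(pos, head, type, member)                                    \
    for (pos = (head) ? container_of((head), type, member) : NULL;             \
         pos;                                                                  \
         pos = pos->member.next ?                                              \
               container_of(pos->member.next, type, member) : NULL)

struct kp {
    int id;
    struct node hlist;
};

int main(void)
{
    struct kp a = { .id = 1 }, b = { .id = 2 };
    struct kp *p;

    a.hlist.next = &b.hlist;
    b.hlist.next = NULL;

    each_entry(p, &a.hlist, struct kp, hlist)
        printf("%d\n", p->id);
    return 0;
}
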
| @@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list); | |||
| 471 | 470 | ||
| 472 | static void kprobe_optimizer(struct work_struct *work); | 471 | static void kprobe_optimizer(struct work_struct *work); |
| 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 472 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
| 474 | static DECLARE_COMPLETION(optimizer_comp); | ||
| 475 | #define OPTIMIZE_DELAY 5 | 473 | #define OPTIMIZE_DELAY 5 |
| 476 | 474 | ||
| 477 | /* | 475 | /* |
| @@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | |||
| 552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | 550 | /* Start optimizer after OPTIMIZE_DELAY passed */ |
| 553 | static __kprobes void kick_kprobe_optimizer(void) | 551 | static __kprobes void kick_kprobe_optimizer(void) |
| 554 | { | 552 | { |
| 555 | if (!delayed_work_pending(&optimizing_work)) | 553 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); |
| 556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
| 557 | } | 554 | } |
| 558 | 555 | ||
| 559 | /* Kprobe jump optimizer */ | 556 | /* Kprobe jump optimizer */ |
| @@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
| 592 | /* Step 5: Kick optimizer again if needed */ | 589 | /* Step 5: Kick optimizer again if needed */ |
| 593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | 590 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) |
| 594 | kick_kprobe_optimizer(); | 591 | kick_kprobe_optimizer(); |
| 595 | else | ||
| 596 | /* Wake up all waiters */ | ||
| 597 | complete_all(&optimizer_comp); | ||
| 598 | } | 592 | } |
| 599 | 593 | ||
| 600 | /* Wait for completing optimization and unoptimization */ | 594 | /* Wait for completing optimization and unoptimization */ |
| 601 | static __kprobes void wait_for_kprobe_optimizer(void) | 595 | static __kprobes void wait_for_kprobe_optimizer(void) |
| 602 | { | 596 | { |
| 603 | if (delayed_work_pending(&optimizing_work)) | 597 | mutex_lock(&kprobe_mutex); |
| 604 | wait_for_completion(&optimizer_comp); | 598 | |
| 599 | while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { | ||
| 600 | mutex_unlock(&kprobe_mutex); | ||
| 601 | |||
| 602 | /* this will also make optimizing_work execute immmediately */ | ||
| 603 | flush_delayed_work(&optimizing_work); | ||
| 604 | /* @optimizing_work might not have been queued yet, relax */ | ||
| 605 | cpu_relax(); | ||
| 606 | |||
| 607 | mutex_lock(&kprobe_mutex); | ||
| 608 | } | ||
| 609 | |||
| 610 | mutex_unlock(&kprobe_mutex); | ||
| 605 | } | 611 | } |
| 606 | 612 | ||
| 607 | /* Optimize kprobe if p is ready to be optimized */ | 613 | /* Optimize kprobe if p is ready to be optimized */ |
| @@ -792,7 +798,6 @@ out: | |||
| 792 | static void __kprobes optimize_all_kprobes(void) | 798 | static void __kprobes optimize_all_kprobes(void) |
| 793 | { | 799 | { |
| 794 | struct hlist_head *head; | 800 | struct hlist_head *head; |
| 795 | struct hlist_node *node; | ||
| 796 | struct kprobe *p; | 801 | struct kprobe *p; |
| 797 | unsigned int i; | 802 | unsigned int i; |
| 798 | 803 | ||
| @@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void) | |||
| 803 | kprobes_allow_optimization = true; | 808 | kprobes_allow_optimization = true; |
| 804 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 809 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 805 | head = &kprobe_table[i]; | 810 | head = &kprobe_table[i]; |
| 806 | hlist_for_each_entry_rcu(p, node, head, hlist) | 811 | hlist_for_each_entry_rcu(p, head, hlist) |
| 807 | if (!kprobe_disabled(p)) | 812 | if (!kprobe_disabled(p)) |
| 808 | optimize_kprobe(p); | 813 | optimize_kprobe(p); |
| 809 | } | 814 | } |
| @@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void) | |||
| 814 | static void __kprobes unoptimize_all_kprobes(void) | 819 | static void __kprobes unoptimize_all_kprobes(void) |
| 815 | { | 820 | { |
| 816 | struct hlist_head *head; | 821 | struct hlist_head *head; |
| 817 | struct hlist_node *node; | ||
| 818 | struct kprobe *p; | 822 | struct kprobe *p; |
| 819 | unsigned int i; | 823 | unsigned int i; |
| 820 | 824 | ||
| @@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
| 825 | kprobes_allow_optimization = false; | 829 | kprobes_allow_optimization = false; |
| 826 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 830 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 827 | head = &kprobe_table[i]; | 831 | head = &kprobe_table[i]; |
| 828 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 832 | hlist_for_each_entry_rcu(p, head, hlist) { |
| 829 | if (!kprobe_disabled(p)) | 833 | if (!kprobe_disabled(p)) |
| 830 | unoptimize_kprobe(p, false); | 834 | unoptimize_kprobe(p, false); |
| 831 | } | 835 | } |
| @@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
| 919 | } | 923 | } |
| 920 | #endif /* CONFIG_OPTPROBES */ | 924 | #endif /* CONFIG_OPTPROBES */ |
| 921 | 925 | ||
| 922 | #ifdef KPROBES_CAN_USE_FTRACE | 926 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| 923 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | 927 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { |
| 924 | .func = kprobe_ftrace_handler, | 928 | .func = kprobe_ftrace_handler, |
| 925 | .flags = FTRACE_OPS_FL_SAVE_REGS, | 929 | .flags = FTRACE_OPS_FL_SAVE_REGS, |
| @@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) | |||
| 964 | (unsigned long)p->addr, 1, 0); | 968 | (unsigned long)p->addr, 1, 0); |
| 965 | WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); | 969 | WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); |
| 966 | } | 970 | } |
| 967 | #else /* !KPROBES_CAN_USE_FTRACE */ | 971 | #else /* !CONFIG_KPROBES_ON_FTRACE */ |
| 968 | #define prepare_kprobe(p) arch_prepare_kprobe(p) | 972 | #define prepare_kprobe(p) arch_prepare_kprobe(p) |
| 969 | #define arm_kprobe_ftrace(p) do {} while (0) | 973 | #define arm_kprobe_ftrace(p) do {} while (0) |
| 970 | #define disarm_kprobe_ftrace(p) do {} while (0) | 974 | #define disarm_kprobe_ftrace(p) do {} while (0) |
| @@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
| 1141 | { | 1145 | { |
| 1142 | struct kretprobe_instance *ri; | 1146 | struct kretprobe_instance *ri; |
| 1143 | struct hlist_head *head, empty_rp; | 1147 | struct hlist_head *head, empty_rp; |
| 1144 | struct hlist_node *node, *tmp; | 1148 | struct hlist_node *tmp; |
| 1145 | unsigned long hash, flags = 0; | 1149 | unsigned long hash, flags = 0; |
| 1146 | 1150 | ||
| 1147 | if (unlikely(!kprobes_initialized)) | 1151 | if (unlikely(!kprobes_initialized)) |
| @@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
| 1152 | hash = hash_ptr(tk, KPROBE_HASH_BITS); | 1156 | hash = hash_ptr(tk, KPROBE_HASH_BITS); |
| 1153 | head = &kretprobe_inst_table[hash]; | 1157 | head = &kretprobe_inst_table[hash]; |
| 1154 | kretprobe_table_lock(hash, &flags); | 1158 | kretprobe_table_lock(hash, &flags); |
| 1155 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 1159 | hlist_for_each_entry_safe(ri, tmp, head, hlist) { |
| 1156 | if (ri->task == tk) | 1160 | if (ri->task == tk) |
| 1157 | recycle_rp_inst(ri, &empty_rp); | 1161 | recycle_rp_inst(ri, &empty_rp); |
| 1158 | } | 1162 | } |
| 1159 | kretprobe_table_unlock(hash, &flags); | 1163 | kretprobe_table_unlock(hash, &flags); |
| 1160 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | 1164 | hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { |
| 1161 | hlist_del(&ri->hlist); | 1165 | hlist_del(&ri->hlist); |
| 1162 | kfree(ri); | 1166 | kfree(ri); |
| 1163 | } | 1167 | } |
| @@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
| 1166 | static inline void free_rp_inst(struct kretprobe *rp) | 1170 | static inline void free_rp_inst(struct kretprobe *rp) |
| 1167 | { | 1171 | { |
| 1168 | struct kretprobe_instance *ri; | 1172 | struct kretprobe_instance *ri; |
| 1169 | struct hlist_node *pos, *next; | 1173 | struct hlist_node *next; |
| 1170 | 1174 | ||
| 1171 | hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { | 1175 | hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { |
| 1172 | hlist_del(&ri->hlist); | 1176 | hlist_del(&ri->hlist); |
| 1173 | kfree(ri); | 1177 | kfree(ri); |
| 1174 | } | 1178 | } |
| @@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
| 1178 | { | 1182 | { |
| 1179 | unsigned long flags, hash; | 1183 | unsigned long flags, hash; |
| 1180 | struct kretprobe_instance *ri; | 1184 | struct kretprobe_instance *ri; |
| 1181 | struct hlist_node *pos, *next; | 1185 | struct hlist_node *next; |
| 1182 | struct hlist_head *head; | 1186 | struct hlist_head *head; |
| 1183 | 1187 | ||
| 1184 | /* No race here */ | 1188 | /* No race here */ |
| 1185 | for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { | 1189 | for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { |
| 1186 | kretprobe_table_lock(hash, &flags); | 1190 | kretprobe_table_lock(hash, &flags); |
| 1187 | head = &kretprobe_inst_table[hash]; | 1191 | head = &kretprobe_inst_table[hash]; |
| 1188 | hlist_for_each_entry_safe(ri, pos, next, head, hlist) { | 1192 | hlist_for_each_entry_safe(ri, next, head, hlist) { |
| 1189 | if (ri->rp == rp) | 1193 | if (ri->rp == rp) |
| 1190 | ri->rp = NULL; | 1194 | ri->rp = NULL; |
| 1191 | } | 1195 | } |
| @@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, | |||
| 1414 | */ | 1418 | */ |
| 1415 | ftrace_addr = ftrace_location((unsigned long)p->addr); | 1419 | ftrace_addr = ftrace_location((unsigned long)p->addr); |
| 1416 | if (ftrace_addr) { | 1420 | if (ftrace_addr) { |
| 1417 | #ifdef KPROBES_CAN_USE_FTRACE | 1421 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| 1418 | /* Given address is not on the instruction boundary */ | 1422 | /* Given address is not on the instruction boundary */ |
| 1419 | if ((unsigned long)p->addr != ftrace_addr) | 1423 | if ((unsigned long)p->addr != ftrace_addr) |
| 1420 | return -EILSEQ; | 1424 | return -EILSEQ; |
| 1421 | p->flags |= KPROBE_FLAG_FTRACE; | 1425 | p->flags |= KPROBE_FLAG_FTRACE; |
| 1422 | #else /* !KPROBES_CAN_USE_FTRACE */ | 1426 | #else /* !CONFIG_KPROBES_ON_FTRACE */ |
| 1423 | return -EINVAL; | 1427 | return -EINVAL; |
| 1424 | #endif | 1428 | #endif |
| 1425 | } | 1429 | } |
| @@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, | |||
| 2021 | { | 2025 | { |
| 2022 | struct module *mod = data; | 2026 | struct module *mod = data; |
| 2023 | struct hlist_head *head; | 2027 | struct hlist_head *head; |
| 2024 | struct hlist_node *node; | ||
| 2025 | struct kprobe *p; | 2028 | struct kprobe *p; |
| 2026 | unsigned int i; | 2029 | unsigned int i; |
| 2027 | int checkcore = (val == MODULE_STATE_GOING); | 2030 | int checkcore = (val == MODULE_STATE_GOING); |
| @@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, | |||
| 2038 | mutex_lock(&kprobe_mutex); | 2041 | mutex_lock(&kprobe_mutex); |
| 2039 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2042 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2040 | head = &kprobe_table[i]; | 2043 | head = &kprobe_table[i]; |
| 2041 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2044 | hlist_for_each_entry_rcu(p, head, hlist) |
| 2042 | if (within_module_init((unsigned long)p->addr, mod) || | 2045 | if (within_module_init((unsigned long)p->addr, mod) || |
| 2043 | (checkcore && | 2046 | (checkcore && |
| 2044 | within_module_core((unsigned long)p->addr, mod))) { | 2047 | within_module_core((unsigned long)p->addr, mod))) { |
| @@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) | |||
| 2185 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | 2188 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) |
| 2186 | { | 2189 | { |
| 2187 | struct hlist_head *head; | 2190 | struct hlist_head *head; |
| 2188 | struct hlist_node *node; | ||
| 2189 | struct kprobe *p, *kp; | 2191 | struct kprobe *p, *kp; |
| 2190 | const char *sym = NULL; | 2192 | const char *sym = NULL; |
| 2191 | unsigned int i = *(loff_t *) v; | 2193 | unsigned int i = *(loff_t *) v; |
| @@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
| 2194 | 2196 | ||
| 2195 | head = &kprobe_table[i]; | 2197 | head = &kprobe_table[i]; |
| 2196 | preempt_disable(); | 2198 | preempt_disable(); |
| 2197 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2199 | hlist_for_each_entry_rcu(p, head, hlist) { |
| 2198 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 2200 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
| 2199 | &offset, &modname, namebuf); | 2201 | &offset, &modname, namebuf); |
| 2200 | if (kprobe_aggrprobe(p)) { | 2202 | if (kprobe_aggrprobe(p)) { |
| @@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = { | |||
| 2229 | static void __kprobes arm_all_kprobes(void) | 2231 | static void __kprobes arm_all_kprobes(void) |
| 2230 | { | 2232 | { |
| 2231 | struct hlist_head *head; | 2233 | struct hlist_head *head; |
| 2232 | struct hlist_node *node; | ||
| 2233 | struct kprobe *p; | 2234 | struct kprobe *p; |
| 2234 | unsigned int i; | 2235 | unsigned int i; |
| 2235 | 2236 | ||
| @@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void) | |||
| 2242 | /* Arming kprobes doesn't optimize kprobe itself */ | 2243 | /* Arming kprobes doesn't optimize kprobe itself */ |
| 2243 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2244 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2244 | head = &kprobe_table[i]; | 2245 | head = &kprobe_table[i]; |
| 2245 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2246 | hlist_for_each_entry_rcu(p, head, hlist) |
| 2246 | if (!kprobe_disabled(p)) | 2247 | if (!kprobe_disabled(p)) |
| 2247 | arm_kprobe(p); | 2248 | arm_kprobe(p); |
| 2248 | } | 2249 | } |
| @@ -2258,7 +2259,6 @@ already_enabled: | |||
| 2258 | static void __kprobes disarm_all_kprobes(void) | 2259 | static void __kprobes disarm_all_kprobes(void) |
| 2259 | { | 2260 | { |
| 2260 | struct hlist_head *head; | 2261 | struct hlist_head *head; |
| 2261 | struct hlist_node *node; | ||
| 2262 | struct kprobe *p; | 2262 | struct kprobe *p; |
| 2263 | unsigned int i; | 2263 | unsigned int i; |
| 2264 | 2264 | ||
| @@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void) | |||
| 2275 | 2275 | ||
| 2276 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2276 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2277 | head = &kprobe_table[i]; | 2277 | head = &kprobe_table[i]; |
| 2278 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2278 | hlist_for_each_entry_rcu(p, head, hlist) { |
| 2279 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2279 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
| 2280 | disarm_kprobe(p, false); | 2280 | disarm_kprobe(p, false); |
| 2281 | } | 2281 | } |
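Most of the mechanical churn in the kprobes hunks above (and in the pid.c hunk further down) comes from the hlist iterator API change: hlist_for_each_entry_rcu() and hlist_for_each_entry_safe() now derive the cursor from the entry pointer itself, so the separate struct hlist_node argument disappears. A hedged sketch with made-up names (demo_entry, demo_head, demo_lookup, demo_free_all):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct demo_entry {
	int key;
	struct hlist_node hlist;
};

static struct hlist_head demo_head;

/* Caller must hold rcu_read_lock() (or otherwise pin the list). */
static struct demo_entry *demo_lookup(int key)
{
	struct demo_entry *e;

	/*
	 * Old API needed a separate cursor:
	 *	struct hlist_node *node;
	 *	hlist_for_each_entry_rcu(e, node, &demo_head, hlist)
	 * New API drops it:
	 */
	hlist_for_each_entry_rcu(e, &demo_head, hlist)
		if (e->key == key)
			return e;
	return NULL;
}

static void demo_free_all(void)
{
	struct demo_entry *e;
	struct hlist_node *tmp;	/* the _safe variant still needs a temporary */

	hlist_for_each_entry_safe(e, tmp, &demo_head, hlist) {
		hlist_del(&e->hlist);
		kfree(e);
	}
}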
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..259db207b5d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3190 | #endif | 3190 | #endif |
| 3191 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | 3191 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { |
| 3192 | debug_locks_off(); | 3192 | debug_locks_off(); |
| 3193 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); | 3193 | printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", |
| 3194 | curr->lockdep_depth, MAX_LOCK_DEPTH); | ||
| 3194 | printk("turning off the locking correctness validator.\n"); | 3195 | printk("turning off the locking correctness validator.\n"); |
| 3196 | |||
| 3197 | lockdep_print_held_locks(current); | ||
| 3198 | debug_show_all_locks(); | ||
| 3195 | dump_stack(); | 3199 | dump_stack(); |
| 3200 | |||
| 3196 | return 0; | 3201 | return 0; |
| 3197 | } | 3202 | } |
| 3198 | 3203 | ||
| @@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3203 | } | 3208 | } |
| 3204 | 3209 | ||
| 3205 | static int | 3210 | static int |
| 3206 | print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | 3211 | print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, |
| 3207 | unsigned long ip) | 3212 | unsigned long ip) |
| 3208 | { | 3213 | { |
| 3209 | if (!debug_locks_off()) | 3214 | if (!debug_locks_off()) |
| @@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | |||
| 3246 | return 0; | 3251 | return 0; |
| 3247 | 3252 | ||
| 3248 | if (curr->lockdep_depth <= 0) | 3253 | if (curr->lockdep_depth <= 0) |
| 3249 | return print_unlock_inbalance_bug(curr, lock, ip); | 3254 | return print_unlock_imbalance_bug(curr, lock, ip); |
| 3250 | 3255 | ||
| 3251 | return 1; | 3256 | return 1; |
| 3252 | } | 3257 | } |
| @@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
| 3317 | goto found_it; | 3322 | goto found_it; |
| 3318 | prev_hlock = hlock; | 3323 | prev_hlock = hlock; |
| 3319 | } | 3324 | } |
| 3320 | return print_unlock_inbalance_bug(curr, lock, ip); | 3325 | return print_unlock_imbalance_bug(curr, lock, ip); |
| 3321 | 3326 | ||
| 3322 | found_it: | 3327 | found_it: |
| 3323 | lockdep_init_map(lock, name, key, 0); | 3328 | lockdep_init_map(lock, name, key, 0); |
| @@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr, | |||
| 3384 | goto found_it; | 3389 | goto found_it; |
| 3385 | prev_hlock = hlock; | 3390 | prev_hlock = hlock; |
| 3386 | } | 3391 | } |
| 3387 | return print_unlock_inbalance_bug(curr, lock, ip); | 3392 | return print_unlock_imbalance_bug(curr, lock, ip); |
| 3388 | 3393 | ||
| 3389 | found_it: | 3394 | found_it: |
| 3390 | if (hlock->instance == lock) | 3395 | if (hlock->instance == lock) |
| @@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
| 4083 | } | 4088 | } |
| 4084 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | 4089 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); |
| 4085 | 4090 | ||
| 4086 | static void print_held_locks_bug(struct task_struct *curr) | 4091 | static void print_held_locks_bug(void) |
| 4087 | { | 4092 | { |
| 4088 | if (!debug_locks_off()) | 4093 | if (!debug_locks_off()) |
| 4089 | return; | 4094 | return; |
| @@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
| 4092 | 4097 | ||
| 4093 | printk("\n"); | 4098 | printk("\n"); |
| 4094 | printk("=====================================\n"); | 4099 | printk("=====================================\n"); |
| 4095 | printk("[ BUG: lock held at task exit time! ]\n"); | 4100 | printk("[ BUG: %s/%d still has locks held! ]\n", |
| 4101 | current->comm, task_pid_nr(current)); | ||
| 4096 | print_kernel_ident(); | 4102 | print_kernel_ident(); |
| 4097 | printk("-------------------------------------\n"); | 4103 | printk("-------------------------------------\n"); |
| 4098 | printk("%s/%d is exiting with locks still held!\n", | 4104 | lockdep_print_held_locks(current); |
| 4099 | curr->comm, task_pid_nr(curr)); | ||
| 4100 | lockdep_print_held_locks(curr); | ||
| 4101 | |||
| 4102 | printk("\nstack backtrace:\n"); | 4105 | printk("\nstack backtrace:\n"); |
| 4103 | dump_stack(); | 4106 | dump_stack(); |
| 4104 | } | 4107 | } |
| 4105 | 4108 | ||
| 4106 | void debug_check_no_locks_held(struct task_struct *task) | 4109 | void debug_check_no_locks_held(void) |
| 4107 | { | 4110 | { |
| 4108 | if (unlikely(task->lockdep_depth > 0)) | 4111 | if (unlikely(current->lockdep_depth > 0)) |
| 4109 | print_held_locks_bug(task); | 4112 | print_held_locks_bug(); |
| 4110 | } | 4113 | } |
| 4114 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); | ||
| 4111 | 4115 | ||
| 4112 | void debug_show_all_locks(void) | 4116 | void debug_show_all_locks(void) |
| 4113 | { | 4117 | { |
diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..0925c9a71975 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -188,6 +188,7 @@ struct load_info { | |||
| 188 | ongoing or failed initialization etc. */ | 188 | ongoing or failed initialization etc. */ |
| 189 | static inline int strong_try_module_get(struct module *mod) | 189 | static inline int strong_try_module_get(struct module *mod) |
| 190 | { | 190 | { |
| 191 | BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); | ||
| 191 | if (mod && mod->state == MODULE_STATE_COMING) | 192 | if (mod && mod->state == MODULE_STATE_COMING) |
| 192 | return -EBUSY; | 193 | return -EBUSY; |
| 193 | if (try_module_get(mod)) | 194 | if (try_module_get(mod)) |
| @@ -196,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) | |||
| 196 | return -ENOENT; | 197 | return -ENOENT; |
| 197 | } | 198 | } |
| 198 | 199 | ||
| 199 | static inline void add_taint_module(struct module *mod, unsigned flag) | 200 | static inline void add_taint_module(struct module *mod, unsigned flag, |
| 201 | enum lockdep_ok lockdep_ok) | ||
| 200 | { | 202 | { |
| 201 | add_taint(flag); | 203 | add_taint(flag, lockdep_ok); |
| 202 | mod->taints |= (1U << flag); | 204 | mod->taints |= (1U << flag); |
| 203 | } | 205 | } |
| 204 | 206 | ||
| @@ -343,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, | |||
| 343 | #endif | 345 | #endif |
| 344 | }; | 346 | }; |
| 345 | 347 | ||
| 348 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 349 | continue; | ||
| 350 | |||
| 346 | if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) | 351 | if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) |
| 347 | return true; | 352 | return true; |
| 348 | } | 353 | } |
| @@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
| 450 | EXPORT_SYMBOL_GPL(find_symbol); | 455 | EXPORT_SYMBOL_GPL(find_symbol); |
| 451 | 456 | ||
| 452 | /* Search for module by name: must hold module_mutex. */ | 457 | /* Search for module by name: must hold module_mutex. */ |
| 453 | struct module *find_module(const char *name) | 458 | static struct module *find_module_all(const char *name, |
| 459 | bool even_unformed) | ||
| 454 | { | 460 | { |
| 455 | struct module *mod; | 461 | struct module *mod; |
| 456 | 462 | ||
| 457 | list_for_each_entry(mod, &modules, list) { | 463 | list_for_each_entry(mod, &modules, list) { |
| 464 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) | ||
| 465 | continue; | ||
| 458 | if (strcmp(mod->name, name) == 0) | 466 | if (strcmp(mod->name, name) == 0) |
| 459 | return mod; | 467 | return mod; |
| 460 | } | 468 | } |
| 461 | return NULL; | 469 | return NULL; |
| 462 | } | 470 | } |
| 471 | |||
| 472 | struct module *find_module(const char *name) | ||
| 473 | { | ||
| 474 | return find_module_all(name, false); | ||
| 475 | } | ||
| 463 | EXPORT_SYMBOL_GPL(find_module); | 476 | EXPORT_SYMBOL_GPL(find_module); |
| 464 | 477 | ||
| 465 | #ifdef CONFIG_SMP | 478 | #ifdef CONFIG_SMP |
| @@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr) | |||
| 525 | preempt_disable(); | 538 | preempt_disable(); |
| 526 | 539 | ||
| 527 | list_for_each_entry_rcu(mod, &modules, list) { | 540 | list_for_each_entry_rcu(mod, &modules, list) { |
| 541 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 542 | continue; | ||
| 528 | if (!mod->percpu_size) | 543 | if (!mod->percpu_size) |
| 529 | continue; | 544 | continue; |
| 530 | for_each_possible_cpu(cpu) { | 545 | for_each_possible_cpu(cpu) { |
| @@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) | |||
| 713 | { | 728 | { |
| 714 | int ret = (flags & O_TRUNC); | 729 | int ret = (flags & O_TRUNC); |
| 715 | if (ret) | 730 | if (ret) |
| 716 | add_taint(TAINT_FORCED_RMMOD); | 731 | add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); |
| 717 | return ret; | 732 | return ret; |
| 718 | } | 733 | } |
| 719 | #else | 734 | #else |
| @@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, | |||
| 1048 | case MODULE_STATE_GOING: | 1063 | case MODULE_STATE_GOING: |
| 1049 | state = "going"; | 1064 | state = "going"; |
| 1050 | break; | 1065 | break; |
| 1066 | default: | ||
| 1067 | BUG(); | ||
| 1051 | } | 1068 | } |
| 1052 | return sprintf(buffer, "%s\n", state); | 1069 | return sprintf(buffer, "%s\n", state); |
| 1053 | } | 1070 | } |
| @@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) | |||
| 1122 | if (!test_taint(TAINT_FORCED_MODULE)) | 1139 | if (!test_taint(TAINT_FORCED_MODULE)) |
| 1123 | printk(KERN_WARNING "%s: %s: kernel tainted.\n", | 1140 | printk(KERN_WARNING "%s: %s: kernel tainted.\n", |
| 1124 | mod->name, reason); | 1141 | mod->name, reason); |
| 1125 | add_taint_module(mod, TAINT_FORCED_MODULE); | 1142 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); |
| 1126 | return 0; | 1143 | return 0; |
| 1127 | #else | 1144 | #else |
| 1128 | return -ENOEXEC; | 1145 | return -ENOEXEC; |
| @@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void) | |||
| 1786 | 1803 | ||
| 1787 | mutex_lock(&module_mutex); | 1804 | mutex_lock(&module_mutex); |
| 1788 | list_for_each_entry_rcu(mod, &modules, list) { | 1805 | list_for_each_entry_rcu(mod, &modules, list) { |
| 1806 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 1807 | continue; | ||
| 1789 | if ((mod->module_core) && (mod->core_text_size)) { | 1808 | if ((mod->module_core) && (mod->core_text_size)) { |
| 1790 | set_page_attributes(mod->module_core, | 1809 | set_page_attributes(mod->module_core, |
| 1791 | mod->module_core + mod->core_text_size, | 1810 | mod->module_core + mod->core_text_size, |
| @@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void) | |||
| 1807 | 1826 | ||
| 1808 | mutex_lock(&module_mutex); | 1827 | mutex_lock(&module_mutex); |
| 1809 | list_for_each_entry_rcu(mod, &modules, list) { | 1828 | list_for_each_entry_rcu(mod, &modules, list) { |
| 1829 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 1830 | continue; | ||
| 1810 | if ((mod->module_core) && (mod->core_text_size)) { | 1831 | if ((mod->module_core) && (mod->core_text_size)) { |
| 1811 | set_page_attributes(mod->module_core, | 1832 | set_page_attributes(mod->module_core, |
| 1812 | mod->module_core + mod->core_text_size, | 1833 | mod->module_core + mod->core_text_size, |
| @@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license) | |||
| 2127 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) | 2148 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) |
| 2128 | printk(KERN_WARNING "%s: module license '%s' taints " | 2149 | printk(KERN_WARNING "%s: module license '%s' taints " |
| 2129 | "kernel.\n", mod->name, license); | 2150 | "kernel.\n", mod->name, license); |
| 2130 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2151 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
| 2152 | LOCKDEP_NOW_UNRELIABLE); | ||
| 2131 | } | 2153 | } |
| 2132 | } | 2154 | } |
| 2133 | 2155 | ||
| @@ -2519,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
| 2519 | if (err) | 2541 | if (err) |
| 2520 | goto out; | 2542 | goto out; |
| 2521 | 2543 | ||
| 2522 | err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); | 2544 | err = vfs_getattr(&file->f_path, &stat); |
| 2523 | if (err) | 2545 | if (err) |
| 2524 | goto out; | 2546 | goto out; |
| 2525 | 2547 | ||
| @@ -2527,6 +2549,13 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
| 2527 | err = -EFBIG; | 2549 | err = -EFBIG; |
| 2528 | goto out; | 2550 | goto out; |
| 2529 | } | 2551 | } |
| 2552 | |||
| 2553 | /* Don't hand 0 to vmalloc, it whines. */ | ||
| 2554 | if (stat.size == 0) { | ||
| 2555 | err = -EINVAL; | ||
| 2556 | goto out; | ||
| 2557 | } | ||
| 2558 | |||
| 2530 | info->hdr = vmalloc(stat.size); | 2559 | info->hdr = vmalloc(stat.size); |
| 2531 | if (!info->hdr) { | 2560 | if (!info->hdr) { |
| 2532 | err = -ENOMEM; | 2561 | err = -ENOMEM; |
| @@ -2673,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
| 2673 | } | 2702 | } |
| 2674 | 2703 | ||
| 2675 | if (!get_modinfo(info, "intree")) | 2704 | if (!get_modinfo(info, "intree")) |
| 2676 | add_taint_module(mod, TAINT_OOT_MODULE); | 2705 | add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); |
| 2677 | 2706 | ||
| 2678 | if (get_modinfo(info, "staging")) { | 2707 | if (get_modinfo(info, "staging")) { |
| 2679 | add_taint_module(mod, TAINT_CRAP); | 2708 | add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); |
| 2680 | printk(KERN_WARNING "%s: module is from the staging directory," | 2709 | printk(KERN_WARNING "%s: module is from the staging directory," |
| 2681 | " the quality is unknown, you have been warned.\n", | 2710 | " the quality is unknown, you have been warned.\n", |
| 2682 | mod->name); | 2711 | mod->name); |
| @@ -2842,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod) | |||
| 2842 | * using GPL-only symbols it needs. | 2871 | * using GPL-only symbols it needs. |
| 2843 | */ | 2872 | */ |
| 2844 | if (strcmp(mod->name, "ndiswrapper") == 0) | 2873 | if (strcmp(mod->name, "ndiswrapper") == 0) |
| 2845 | add_taint(TAINT_PROPRIETARY_MODULE); | 2874 | add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); |
| 2846 | 2875 | ||
| 2847 | /* driverloader was caught wrongly pretending to be under GPL */ | 2876 | /* driverloader was caught wrongly pretending to be under GPL */ |
| 2848 | if (strcmp(mod->name, "driverloader") == 0) | 2877 | if (strcmp(mod->name, "driverloader") == 0) |
| 2849 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2878 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
| 2879 | LOCKDEP_NOW_UNRELIABLE); | ||
| 2850 | 2880 | ||
| 2851 | /* lve claims to be GPL but upstream won't provide source */ | 2881 | /* lve claims to be GPL but upstream won't provide source */ |
| 2852 | if (strcmp(mod->name, "lve") == 0) | 2882 | if (strcmp(mod->name, "lve") == 0) |
| 2853 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2883 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
| 2884 | LOCKDEP_NOW_UNRELIABLE); | ||
| 2854 | 2885 | ||
| 2855 | #ifdef CONFIG_MODVERSIONS | 2886 | #ifdef CONFIG_MODVERSIONS |
| 2856 | if ((mod->num_syms && !mod->crcs) | 2887 | if ((mod->num_syms && !mod->crcs) |
| @@ -2990,8 +3021,9 @@ static bool finished_loading(const char *name) | |||
| 2990 | bool ret; | 3021 | bool ret; |
| 2991 | 3022 | ||
| 2992 | mutex_lock(&module_mutex); | 3023 | mutex_lock(&module_mutex); |
| 2993 | mod = find_module(name); | 3024 | mod = find_module_all(name, true); |
| 2994 | ret = !mod || mod->state != MODULE_STATE_COMING; | 3025 | ret = !mod || mod->state == MODULE_STATE_LIVE |
| 3026 | || mod->state == MODULE_STATE_GOING; | ||
| 2995 | mutex_unlock(&module_mutex); | 3027 | mutex_unlock(&module_mutex); |
| 2996 | 3028 | ||
| 2997 | return ret; | 3029 | return ret; |
| @@ -3013,6 +3045,12 @@ static int do_init_module(struct module *mod) | |||
| 3013 | { | 3045 | { |
| 3014 | int ret = 0; | 3046 | int ret = 0; |
| 3015 | 3047 | ||
| 3048 | /* | ||
| 3049 | * We want to find out whether @mod uses async during init. Clear | ||
| 3050 | * PF_USED_ASYNC. async_schedule*() will set it. | ||
| 3051 | */ | ||
| 3052 | current->flags &= ~PF_USED_ASYNC; | ||
| 3053 | |||
| 3016 | blocking_notifier_call_chain(&module_notify_list, | 3054 | blocking_notifier_call_chain(&module_notify_list, |
| 3017 | MODULE_STATE_COMING, mod); | 3055 | MODULE_STATE_COMING, mod); |
| 3018 | 3056 | ||
| @@ -3058,8 +3096,25 @@ static int do_init_module(struct module *mod) | |||
| 3058 | blocking_notifier_call_chain(&module_notify_list, | 3096 | blocking_notifier_call_chain(&module_notify_list, |
| 3059 | MODULE_STATE_LIVE, mod); | 3097 | MODULE_STATE_LIVE, mod); |
| 3060 | 3098 | ||
| 3061 | /* We need to finish all async code before the module init sequence is done */ | 3099 | /* |
| 3062 | async_synchronize_full(); | 3100 | * We need to finish all async code before the module init sequence |
| 3101 | * is done. This has potential to deadlock. For example, a newly | ||
| 3102 | * detected block device can trigger request_module() of the | ||
| 3103 | * default iosched from an async probing task. Once the userland | ||
| 3104 | * helper reaches here, async_synchronize_full() will wait on the | ||
| 3105 | * async task waiting on request_module() and deadlock. | ||
| 3106 | * | ||
| 3107 | * This deadlock is avoided by performing async_synchronize_full() | ||
| 3108 | * iff module init queued any async jobs. This isn't a full | ||
| 3109 | * solution, as it will still deadlock if module loading from | ||
| 3110 | * async jobs nests more than once; however, due to the various | ||
| 3111 | * constraints, this hack seems to be the best option for now. | ||
| 3112 | * Please refer to the following thread for details. | ||
| 3113 | * | ||
| 3114 | * http://thread.gmane.org/gmane.linux.kernel/1420814 | ||
| 3115 | */ | ||
| 3116 | if (current->flags & PF_USED_ASYNC) | ||
| 3117 | async_synchronize_full(); | ||
| 3063 | 3118 | ||
| 3064 | mutex_lock(&module_mutex); | 3119 | mutex_lock(&module_mutex); |
| 3065 | /* Drop initial reference. */ | 3120 | /* Drop initial reference. */ |
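The long comment above explains why do_init_module() now calls async_synchronize_full() only when the module's init actually queued async work, a fact that async_schedule*() records in the loader's PF_USED_ASYNC flag. A hypothetical module that would take that path (all names invented, shown purely for illustration):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/async.h>

static void demo_async_probe(void *data, async_cookie_t cookie)
{
	pr_info("demo: probing in the background\n");
}

static int __init demo_init(void)
{
	/* async_schedule() sets PF_USED_ASYNC on the loading task, so the
	 * do_init_module() change above will call async_synchronize_full(). */
	async_schedule(demo_async_probe, NULL);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Be sure our async work has finished before the module goes away. */
	async_synchronize_full();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");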
| @@ -3090,12 +3145,72 @@ static int may_init_module(void) | |||
| 3090 | return 0; | 3145 | return 0; |
| 3091 | } | 3146 | } |
| 3092 | 3147 | ||
| 3148 | /* | ||
| 3149 | * We try to place it in the list now to make sure it's unique before | ||
| 3150 | * we dedicate too many resources. In particular, this avoids | ||
| 3151 | * temporary percpu memory exhaustion. | ||
| 3152 | */ | ||
| 3153 | static int add_unformed_module(struct module *mod) | ||
| 3154 | { | ||
| 3155 | int err; | ||
| 3156 | struct module *old; | ||
| 3157 | |||
| 3158 | mod->state = MODULE_STATE_UNFORMED; | ||
| 3159 | |||
| 3160 | again: | ||
| 3161 | mutex_lock(&module_mutex); | ||
| 3162 | if ((old = find_module_all(mod->name, true)) != NULL) { | ||
| 3163 | if (old->state == MODULE_STATE_COMING | ||
| 3164 | || old->state == MODULE_STATE_UNFORMED) { | ||
| 3165 | /* Wait in case it fails to load. */ | ||
| 3166 | mutex_unlock(&module_mutex); | ||
| 3167 | err = wait_event_interruptible(module_wq, | ||
| 3168 | finished_loading(mod->name)); | ||
| 3169 | if (err) | ||
| 3170 | goto out_unlocked; | ||
| 3171 | goto again; | ||
| 3172 | } | ||
| 3173 | err = -EEXIST; | ||
| 3174 | goto out; | ||
| 3175 | } | ||
| 3176 | list_add_rcu(&mod->list, &modules); | ||
| 3177 | err = 0; | ||
| 3178 | |||
| 3179 | out: | ||
| 3180 | mutex_unlock(&module_mutex); | ||
| 3181 | out_unlocked: | ||
| 3182 | return err; | ||
| 3183 | } | ||
| 3184 | |||
| 3185 | static int complete_formation(struct module *mod, struct load_info *info) | ||
| 3186 | { | ||
| 3187 | int err; | ||
| 3188 | |||
| 3189 | mutex_lock(&module_mutex); | ||
| 3190 | |||
| 3191 | /* Find duplicate symbols (must be called under lock). */ | ||
| 3192 | err = verify_export_symbols(mod); | ||
| 3193 | if (err < 0) | ||
| 3194 | goto out; | ||
| 3195 | |||
| 3196 | /* This relies on module_mutex for list integrity. */ | ||
| 3197 | module_bug_finalize(info->hdr, info->sechdrs, mod); | ||
| 3198 | |||
| 3199 | /* Mark state as coming so strong_try_module_get() ignores us, | ||
| 3200 | * but kallsyms etc. can see us. */ | ||
| 3201 | mod->state = MODULE_STATE_COMING; | ||
| 3202 | |||
| 3203 | out: | ||
| 3204 | mutex_unlock(&module_mutex); | ||
| 3205 | return err; | ||
| 3206 | } | ||
| 3207 | |||
| 3093 | /* Allocate and load the module: note that size of section 0 is always | 3208 | /* Allocate and load the module: note that size of section 0 is always |
| 3094 | zero, and we rely on this for optional sections. */ | 3209 | zero, and we rely on this for optional sections. */ |
| 3095 | static int load_module(struct load_info *info, const char __user *uargs, | 3210 | static int load_module(struct load_info *info, const char __user *uargs, |
| 3096 | int flags) | 3211 | int flags) |
| 3097 | { | 3212 | { |
| 3098 | struct module *mod, *old; | 3213 | struct module *mod; |
| 3099 | long err; | 3214 | long err; |
| 3100 | 3215 | ||
| 3101 | err = module_sig_check(info); | 3216 | err = module_sig_check(info); |
| @@ -3113,16 +3228,26 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3113 | goto free_copy; | 3228 | goto free_copy; |
| 3114 | } | 3229 | } |
| 3115 | 3230 | ||
| 3231 | /* Reserve our place in the list. */ | ||
| 3232 | err = add_unformed_module(mod); | ||
| 3233 | if (err) | ||
| 3234 | goto free_module; | ||
| 3235 | |||
| 3116 | #ifdef CONFIG_MODULE_SIG | 3236 | #ifdef CONFIG_MODULE_SIG |
| 3117 | mod->sig_ok = info->sig_ok; | 3237 | mod->sig_ok = info->sig_ok; |
| 3118 | if (!mod->sig_ok) | 3238 | if (!mod->sig_ok) { |
| 3119 | add_taint_module(mod, TAINT_FORCED_MODULE); | 3239 | printk_once(KERN_NOTICE |
| 3240 | "%s: module verification failed: signature and/or" | ||
| 3241 | " required key missing - tainting kernel\n", | ||
| 3242 | mod->name); | ||
| 3243 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); | ||
| 3244 | } | ||
| 3120 | #endif | 3245 | #endif |
| 3121 | 3246 | ||
| 3122 | /* Now module is in final location, initialize linked lists, etc. */ | 3247 | /* Now module is in final location, initialize linked lists, etc. */ |
| 3123 | err = module_unload_init(mod); | 3248 | err = module_unload_init(mod); |
| 3124 | if (err) | 3249 | if (err) |
| 3125 | goto free_module; | 3250 | goto unlink_mod; |
| 3126 | 3251 | ||
| 3127 | /* Now we've got everything in the final locations, we can | 3252 | /* Now we've got everything in the final locations, we can |
| 3128 | * find optional sections. */ | 3253 | * find optional sections. */ |
| @@ -3157,54 +3282,23 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3157 | goto free_arch_cleanup; | 3282 | goto free_arch_cleanup; |
| 3158 | } | 3283 | } |
| 3159 | 3284 | ||
| 3160 | /* Mark state as coming so strong_try_module_get() ignores us. */ | ||
| 3161 | mod->state = MODULE_STATE_COMING; | ||
| 3162 | |||
| 3163 | /* Now sew it into the lists so we can get lockdep and oops | ||
| 3164 | * info during argument parsing. No one should access us, since | ||
| 3165 | * strong_try_module_get() will fail. | ||
| 3166 | * lockdep/oops can run asynchronous, so use the RCU list insertion | ||
| 3167 | * function to insert in a way safe to concurrent readers. | ||
| 3168 | * The mutex protects against concurrent writers. | ||
| 3169 | */ | ||
| 3170 | again: | ||
| 3171 | mutex_lock(&module_mutex); | ||
| 3172 | if ((old = find_module(mod->name)) != NULL) { | ||
| 3173 | if (old->state == MODULE_STATE_COMING) { | ||
| 3174 | /* Wait in case it fails to load. */ | ||
| 3175 | mutex_unlock(&module_mutex); | ||
| 3176 | err = wait_event_interruptible(module_wq, | ||
| 3177 | finished_loading(mod->name)); | ||
| 3178 | if (err) | ||
| 3179 | goto free_arch_cleanup; | ||
| 3180 | goto again; | ||
| 3181 | } | ||
| 3182 | err = -EEXIST; | ||
| 3183 | goto unlock; | ||
| 3184 | } | ||
| 3185 | |||
| 3186 | /* This has to be done once we're sure module name is unique. */ | ||
| 3187 | dynamic_debug_setup(info->debug, info->num_debug); | 3285 | dynamic_debug_setup(info->debug, info->num_debug); |
| 3188 | 3286 | ||
| 3189 | /* Find duplicate symbols */ | 3287 | /* Finally it's fully formed, ready to start executing. */ |
| 3190 | err = verify_export_symbols(mod); | 3288 | err = complete_formation(mod, info); |
| 3191 | if (err < 0) | 3289 | if (err) |
| 3192 | goto ddebug; | 3290 | goto ddebug_cleanup; |
| 3193 | |||
| 3194 | module_bug_finalize(info->hdr, info->sechdrs, mod); | ||
| 3195 | list_add_rcu(&mod->list, &modules); | ||
| 3196 | mutex_unlock(&module_mutex); | ||
| 3197 | 3291 | ||
| 3198 | /* Module is ready to execute: parsing args may do that. */ | 3292 | /* Module is ready to execute: parsing args may do that. */ |
| 3199 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 3293 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
| 3200 | -32768, 32767, &ddebug_dyndbg_module_param_cb); | 3294 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
| 3201 | if (err < 0) | 3295 | if (err < 0) |
| 3202 | goto unlink; | 3296 | goto bug_cleanup; |
| 3203 | 3297 | ||
| 3204 | /* Link in to sysfs. */ | 3298 | /* Link in to sysfs. */ |
| 3205 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); | 3299 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); |
| 3206 | if (err < 0) | 3300 | if (err < 0) |
| 3207 | goto unlink; | 3301 | goto bug_cleanup; |
| 3208 | 3302 | ||
| 3209 | /* Get rid of temporary copy. */ | 3303 | /* Get rid of temporary copy. */ |
| 3210 | free_copy(info); | 3304 | free_copy(info); |
| @@ -3214,16 +3308,13 @@ again: | |||
| 3214 | 3308 | ||
| 3215 | return do_init_module(mod); | 3309 | return do_init_module(mod); |
| 3216 | 3310 | ||
| 3217 | unlink: | 3311 | bug_cleanup: |
| 3312 | /* module_bug_cleanup needs module_mutex protection */ | ||
| 3218 | mutex_lock(&module_mutex); | 3313 | mutex_lock(&module_mutex); |
| 3219 | /* Unlink carefully: kallsyms could be walking list. */ | ||
| 3220 | list_del_rcu(&mod->list); | ||
| 3221 | module_bug_cleanup(mod); | 3314 | module_bug_cleanup(mod); |
| 3222 | wake_up_all(&module_wq); | ||
| 3223 | ddebug: | ||
| 3224 | dynamic_debug_remove(info->debug); | ||
| 3225 | unlock: | ||
| 3226 | mutex_unlock(&module_mutex); | 3315 | mutex_unlock(&module_mutex); |
| 3316 | ddebug_cleanup: | ||
| 3317 | dynamic_debug_remove(info->debug); | ||
| 3227 | synchronize_sched(); | 3318 | synchronize_sched(); |
| 3228 | kfree(mod->args); | 3319 | kfree(mod->args); |
| 3229 | free_arch_cleanup: | 3320 | free_arch_cleanup: |
| @@ -3232,6 +3323,12 @@ again: | |||
| 3232 | free_modinfo(mod); | 3323 | free_modinfo(mod); |
| 3233 | free_unload: | 3324 | free_unload: |
| 3234 | module_unload_free(mod); | 3325 | module_unload_free(mod); |
| 3326 | unlink_mod: | ||
| 3327 | mutex_lock(&module_mutex); | ||
| 3328 | /* Unlink carefully: kallsyms could be walking list. */ | ||
| 3329 | list_del_rcu(&mod->list); | ||
| 3330 | wake_up_all(&module_wq); | ||
| 3331 | mutex_unlock(&module_mutex); | ||
| 3235 | free_module: | 3332 | free_module: |
| 3236 | module_deallocate(mod, info); | 3333 | module_deallocate(mod, info); |
| 3237 | free_copy: | 3334 | free_copy: |
| @@ -3354,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr, | |||
| 3354 | 3451 | ||
| 3355 | preempt_disable(); | 3452 | preempt_disable(); |
| 3356 | list_for_each_entry_rcu(mod, &modules, list) { | 3453 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3454 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3455 | continue; | ||
| 3357 | if (within_module_init(addr, mod) || | 3456 | if (within_module_init(addr, mod) || |
| 3358 | within_module_core(addr, mod)) { | 3457 | within_module_core(addr, mod)) { |
| 3359 | if (modname) | 3458 | if (modname) |
| @@ -3377,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
| 3377 | 3476 | ||
| 3378 | preempt_disable(); | 3477 | preempt_disable(); |
| 3379 | list_for_each_entry_rcu(mod, &modules, list) { | 3478 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3479 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3480 | continue; | ||
| 3380 | if (within_module_init(addr, mod) || | 3481 | if (within_module_init(addr, mod) || |
| 3381 | within_module_core(addr, mod)) { | 3482 | within_module_core(addr, mod)) { |
| 3382 | const char *sym; | 3483 | const char *sym; |
| @@ -3401,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
| 3401 | 3502 | ||
| 3402 | preempt_disable(); | 3503 | preempt_disable(); |
| 3403 | list_for_each_entry_rcu(mod, &modules, list) { | 3504 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3505 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3506 | continue; | ||
| 3404 | if (within_module_init(addr, mod) || | 3507 | if (within_module_init(addr, mod) || |
| 3405 | within_module_core(addr, mod)) { | 3508 | within_module_core(addr, mod)) { |
| 3406 | const char *sym; | 3509 | const char *sym; |
| @@ -3428,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
| 3428 | 3531 | ||
| 3429 | preempt_disable(); | 3532 | preempt_disable(); |
| 3430 | list_for_each_entry_rcu(mod, &modules, list) { | 3533 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3534 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3535 | continue; | ||
| 3431 | if (symnum < mod->num_symtab) { | 3536 | if (symnum < mod->num_symtab) { |
| 3432 | *value = mod->symtab[symnum].st_value; | 3537 | *value = mod->symtab[symnum].st_value; |
| 3433 | *type = mod->symtab[symnum].st_info; | 3538 | *type = mod->symtab[symnum].st_info; |
| @@ -3470,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
| 3470 | ret = mod_find_symname(mod, colon+1); | 3575 | ret = mod_find_symname(mod, colon+1); |
| 3471 | *colon = ':'; | 3576 | *colon = ':'; |
| 3472 | } else { | 3577 | } else { |
| 3473 | list_for_each_entry_rcu(mod, &modules, list) | 3578 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3579 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3580 | continue; | ||
| 3474 | if ((ret = mod_find_symname(mod, name)) != 0) | 3581 | if ((ret = mod_find_symname(mod, name)) != 0) |
| 3475 | break; | 3582 | break; |
| 3583 | } | ||
| 3476 | } | 3584 | } |
| 3477 | preempt_enable(); | 3585 | preempt_enable(); |
| 3478 | return ret; | 3586 | return ret; |
| @@ -3487,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, | |||
| 3487 | int ret; | 3595 | int ret; |
| 3488 | 3596 | ||
| 3489 | list_for_each_entry(mod, &modules, list) { | 3597 | list_for_each_entry(mod, &modules, list) { |
| 3598 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3599 | continue; | ||
| 3490 | for (i = 0; i < mod->num_symtab; i++) { | 3600 | for (i = 0; i < mod->num_symtab; i++) { |
| 3491 | ret = fn(data, mod->strtab + mod->symtab[i].st_name, | 3601 | ret = fn(data, mod->strtab + mod->symtab[i].st_name, |
| 3492 | mod, mod->symtab[i].st_value); | 3602 | mod, mod->symtab[i].st_value); |
| @@ -3502,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf) | |||
| 3502 | { | 3612 | { |
| 3503 | int bx = 0; | 3613 | int bx = 0; |
| 3504 | 3614 | ||
| 3615 | BUG_ON(mod->state == MODULE_STATE_UNFORMED); | ||
| 3505 | if (mod->taints || | 3616 | if (mod->taints || |
| 3506 | mod->state == MODULE_STATE_GOING || | 3617 | mod->state == MODULE_STATE_GOING || |
| 3507 | mod->state == MODULE_STATE_COMING) { | 3618 | mod->state == MODULE_STATE_COMING) { |
| @@ -3543,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3543 | struct module *mod = list_entry(p, struct module, list); | 3654 | struct module *mod = list_entry(p, struct module, list); |
| 3544 | char buf[8]; | 3655 | char buf[8]; |
| 3545 | 3656 | ||
| 3657 | /* We always ignore unformed modules. */ | ||
| 3658 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3659 | return 0; | ||
| 3660 | |||
| 3546 | seq_printf(m, "%s %u", | 3661 | seq_printf(m, "%s %u", |
| 3547 | mod->name, mod->init_size + mod->core_size); | 3662 | mod->name, mod->init_size + mod->core_size); |
| 3548 | print_unload_info(m, mod); | 3663 | print_unload_info(m, mod); |
| @@ -3603,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
| 3603 | 3718 | ||
| 3604 | preempt_disable(); | 3719 | preempt_disable(); |
| 3605 | list_for_each_entry_rcu(mod, &modules, list) { | 3720 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3721 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3722 | continue; | ||
| 3606 | if (mod->num_exentries == 0) | 3723 | if (mod->num_exentries == 0) |
| 3607 | continue; | 3724 | continue; |
| 3608 | 3725 | ||
| @@ -3651,10 +3768,13 @@ struct module *__module_address(unsigned long addr) | |||
| 3651 | if (addr < module_addr_min || addr > module_addr_max) | 3768 | if (addr < module_addr_min || addr > module_addr_max) |
| 3652 | return NULL; | 3769 | return NULL; |
| 3653 | 3770 | ||
| 3654 | list_for_each_entry_rcu(mod, &modules, list) | 3771 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3772 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3773 | continue; | ||
| 3655 | if (within_module_core(addr, mod) | 3774 | if (within_module_core(addr, mod) |
| 3656 | || within_module_init(addr, mod)) | 3775 | || within_module_init(addr, mod)) |
| 3657 | return mod; | 3776 | return mod; |
| 3777 | } | ||
| 3658 | return NULL; | 3778 | return NULL; |
| 3659 | } | 3779 | } |
| 3660 | EXPORT_SYMBOL_GPL(__module_address); | 3780 | EXPORT_SYMBOL_GPL(__module_address); |
| @@ -3707,8 +3827,11 @@ void print_modules(void) | |||
| 3707 | printk(KERN_DEFAULT "Modules linked in:"); | 3827 | printk(KERN_DEFAULT "Modules linked in:"); |
| 3708 | /* Most callers should already have preempt disabled, but make sure */ | 3828 | /* Most callers should already have preempt disabled, but make sure */ |
| 3709 | preempt_disable(); | 3829 | preempt_disable(); |
| 3710 | list_for_each_entry_rcu(mod, &modules, list) | 3830 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3831 | if (mod->state == MODULE_STATE_UNFORMED) | ||
| 3832 | continue; | ||
| 3711 | printk(" %s%s", mod->name, module_flags(mod, buf)); | 3833 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
| 3834 | } | ||
| 3712 | preempt_enable(); | 3835 | preempt_enable(); |
| 3713 | if (last_unloaded_module[0]) | 3836 | if (last_unloaded_module[0]) |
| 3714 | printk(" [last unloaded: %s]", last_unloaded_module); | 3837 | printk(" [last unloaded: %s]", last_unloaded_module); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c9526..52f23011b6e0 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | */ | 19 | */ |
| 20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
| 21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
| 22 | #include <linux/sched/rt.h> | ||
| 22 | #include <linux/export.h> | 23 | #include <linux/export.h> |
| 23 | #include <linux/spinlock.h> | 24 | #include <linux/spinlock.h> |
| 24 | #include <linux/interrupt.h> | 25 | #include <linux/interrupt.h> |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..afc0456f227a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
| 153 | goto out; | 153 | goto out; |
| 154 | } | 154 | } |
| 155 | 155 | ||
| 156 | new_ns = create_new_namespaces(flags, tsk, | 156 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); |
| 157 | task_cred_xxx(tsk, user_ns), tsk->fs); | ||
| 158 | if (IS_ERR(new_ns)) { | 157 | if (IS_ERR(new_ns)) { |
| 159 | err = PTR_ERR(new_ns); | 158 | err = PTR_ERR(new_ns); |
| 160 | goto out; | 159 | goto out; |
| @@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
| 251 | return PTR_ERR(file); | 250 | return PTR_ERR(file); |
| 252 | 251 | ||
| 253 | err = -EINVAL; | 252 | err = -EINVAL; |
| 254 | ei = PROC_I(file->f_dentry->d_inode); | 253 | ei = PROC_I(file_inode(file)); |
| 255 | ops = ei->ns_ops; | 254 | ops = ei->ns_ops; |
| 256 | if (nstype && (ops->type != nstype)) | 255 | if (nstype && (ops->type != nstype)) |
| 257 | goto out; | 256 | goto out; |
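The setns() hunk above switches from dereferencing file->f_dentry->d_inode to the file_inode() helper. A tiny hedged sketch of that accessor (demo_show_ino is an invented name):

#include <linux/fs.h>
#include <linux/printk.h>

static void demo_show_ino(struct file *file)
{
	/* Preferred over open-coding file->f_path.dentry->d_inode. */
	struct inode *inode = file_inode(file);

	pr_info("demo: backing inode %lu\n", inode->i_ino);
}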
diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..7c57cc9eee2c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -259,26 +259,19 @@ unsigned long get_taint(void) | |||
| 259 | return tainted_mask; | 259 | return tainted_mask; |
| 260 | } | 260 | } |
| 261 | 261 | ||
| 262 | void add_taint(unsigned flag) | 262 | /** |
| 263 | * add_taint: add a taint flag if not already set. | ||
| 264 | * @flag: one of the TAINT_* constants. | ||
| 265 | * @lockdep_ok: whether lock debugging is still OK. | ||
| 266 | * | ||
| 267 | * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE, | ||
| 268 | * but for some noteworthy-but-not-corrupting cases, LOCKDEP_STILL_OK is fine. | ||
| 269 | */ | ||
| 270 | void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) | ||
| 263 | { | 271 | { |
| 264 | /* | 272 | if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) |
| 265 | * Can't trust the integrity of the kernel anymore. | 273 | printk(KERN_WARNING |
| 266 | * We don't call directly debug_locks_off() because the issue | 274 | "Disabling lock debugging due to kernel taint\n"); |
| 267 | * is not necessarily serious enough to set oops_in_progress to 1 | ||
| 268 | * Also we want to keep up lockdep for staging/out-of-tree | ||
| 269 | * development and post-warning case. | ||
| 270 | */ | ||
| 271 | switch (flag) { | ||
| 272 | case TAINT_CRAP: | ||
| 273 | case TAINT_OOT_MODULE: | ||
| 274 | case TAINT_WARN: | ||
| 275 | case TAINT_FIRMWARE_WORKAROUND: | ||
| 276 | break; | ||
| 277 | |||
| 278 | default: | ||
| 279 | if (__debug_locks_off()) | ||
| 280 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | ||
| 281 | } | ||
| 282 | 275 | ||
| 283 | set_bit(flag, &tainted_mask); | 276 | set_bit(flag, &tainted_mask); |
| 284 | } | 277 | } |
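With the rework above, every add_taint() caller states explicitly whether lockdep can still be trusted afterwards, instead of add_taint() whitelisting flags internally. A hedged caller-side sketch; the surrounding function is hypothetical, while the flag and enum constants are the kernel's own:

#include <linux/kernel.h>

static void demo_report_problem(bool state_corrupted)
{
	if (state_corrupted)
		/* Lock state may be garbage: switch lock debugging off. */
		add_taint(TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
	else
		/* Noteworthy but harmless: keep lockdep running. */
		add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}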
| @@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, | |||
| 421 | print_modules(); | 414 | print_modules(); |
| 422 | dump_stack(); | 415 | dump_stack(); |
| 423 | print_oops_end_marker(); | 416 | print_oops_end_marker(); |
| 424 | add_taint(taint); | 417 | /* Just a warning, don't kill lockdep. */ |
| 418 | add_taint(taint, LOCKDEP_STILL_OK); | ||
| 425 | } | 419 | } |
| 426 | 420 | ||
| 427 | void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) | 421 | void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) |
diff --git a/kernel/pid.c b/kernel/pid.c index de9af600006f..047dc6264638 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -331,7 +331,7 @@ out: | |||
| 331 | return pid; | 331 | return pid; |
| 332 | 332 | ||
| 333 | out_unlock: | 333 | out_unlock: |
| 334 | spin_unlock(&pidmap_lock); | 334 | spin_unlock_irq(&pidmap_lock); |
| 335 | out_free: | 335 | out_free: |
| 336 | while (++i <= ns->level) | 336 | while (++i <= ns->level) |
| 337 | free_pidmap(pid->numbers + i); | 337 | free_pidmap(pid->numbers + i); |
| @@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns) | |||
| 350 | 350 | ||
| 351 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) | 351 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) |
| 352 | { | 352 | { |
| 353 | struct hlist_node *elem; | ||
| 354 | struct upid *pnr; | 353 | struct upid *pnr; |
| 355 | 354 | ||
| 356 | hlist_for_each_entry_rcu(pnr, elem, | 355 | hlist_for_each_entry_rcu(pnr, |
| 357 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) | 356 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) |
| 358 | if (pnr->nr == nr && pnr->ns == ns) | 357 | if (pnr->nr == nr && pnr->ns == ns) |
| 359 | return container_of(pnr, struct pid, | 358 | return container_of(pnr, struct pid, |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d6..8fd709c9bb58 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
| 155 | 155 | ||
| 156 | static inline cputime_t prof_ticks(struct task_struct *p) | 156 | static inline cputime_t prof_ticks(struct task_struct *p) |
| 157 | { | 157 | { |
| 158 | return p->utime + p->stime; | 158 | cputime_t utime, stime; |
| 159 | |||
| 160 | task_cputime(p, &utime, &stime); | ||
| 161 | |||
| 162 | return utime + stime; | ||
| 159 | } | 163 | } |
| 160 | static inline cputime_t virt_ticks(struct task_struct *p) | 164 | static inline cputime_t virt_ticks(struct task_struct *p) |
| 161 | { | 165 | { |
| 162 | return p->utime; | 166 | cputime_t utime; |
| 167 | |||
| 168 | task_cputime(p, &utime, NULL); | ||
| 169 | |||
| 170 | return utime; | ||
| 163 | } | 171 | } |
| 164 | 172 | ||
| 165 | static int | 173 | static int |
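prof_ticks() and virt_ticks() above now read utime/stime through the task_cputime() accessor instead of touching p->utime and p->stime directly, which lets accounting schemes that compute cputime lazily hook in at one place. A hedged sketch of the accessor (demo_total_ticks is an invented helper):

#include <linux/sched.h>

static cputime_t demo_total_ticks(struct task_struct *p)
{
	cputime_t utime, stime;

	/* As in virt_ticks() above, either output pointer may be NULL. */
	task_cputime(p, &utime, &stime);

	return utime + stime;
}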
| @@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head, | |||
| 471 | */ | 479 | */ |
| 472 | void posix_cpu_timers_exit(struct task_struct *tsk) | 480 | void posix_cpu_timers_exit(struct task_struct *tsk) |
| 473 | { | 481 | { |
| 482 | cputime_t utime, stime; | ||
| 483 | |||
| 474 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, | 484 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, |
| 475 | sizeof(unsigned long long)); | 485 | sizeof(unsigned long long)); |
| 486 | task_cputime(tsk, &utime, &stime); | ||
| 476 | cleanup_timers(tsk->cpu_timers, | 487 | cleanup_timers(tsk->cpu_timers, |
| 477 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); | 488 | utime, stime, tsk->se.sum_exec_runtime); |
| 478 | 489 | ||
| 479 | } | 490 | } |
| 480 | void posix_cpu_timers_exit_group(struct task_struct *tsk) | 491 | void posix_cpu_timers_exit_group(struct task_struct *tsk) |
| 481 | { | 492 | { |
| 482 | struct signal_struct *const sig = tsk->signal; | 493 | struct signal_struct *const sig = tsk->signal; |
| 494 | cputime_t utime, stime; | ||
| 483 | 495 | ||
| 496 | task_cputime(tsk, &utime, &stime); | ||
| 484 | cleanup_timers(tsk->signal->cpu_timers, | 497 | cleanup_timers(tsk->signal->cpu_timers, |
| 485 | tsk->utime + sig->utime, tsk->stime + sig->stime, | 498 | utime + sig->utime, stime + sig->stime, |
| 486 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | 499 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); |
| 487 | } | 500 | } |
| 488 | 501 | ||
| @@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample, | |||
| 1226 | static inline int fastpath_timer_check(struct task_struct *tsk) | 1239 | static inline int fastpath_timer_check(struct task_struct *tsk) |
| 1227 | { | 1240 | { |
| 1228 | struct signal_struct *sig; | 1241 | struct signal_struct *sig; |
| 1242 | cputime_t utime, stime; | ||
| 1243 | |||
| 1244 | task_cputime(tsk, &utime, &stime); | ||
| 1229 | 1245 | ||
| 1230 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1246 | if (!task_cputime_zero(&tsk->cputime_expires)) { |
| 1231 | struct task_cputime task_sample = { | 1247 | struct task_cputime task_sample = { |
| 1232 | .utime = tsk->utime, | 1248 | .utime = utime, |
| 1233 | .stime = tsk->stime, | 1249 | .stime = stime, |
| 1234 | .sum_exec_runtime = tsk->se.sum_exec_runtime | 1250 | .sum_exec_runtime = tsk->se.sum_exec_runtime |
| 1235 | }; | 1251 | }; |
| 1236 | 1252 | ||
| @@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1401 | while (!signal_pending(current)) { | 1417 | while (!signal_pending(current)) { |
| 1402 | if (timer.it.cpu.expires.sched == 0) { | 1418 | if (timer.it.cpu.expires.sched == 0) { |
| 1403 | /* | 1419 | /* |
| 1404 | * Our timer fired and was reset. | 1420 | * Our timer fired and was reset; the |
| 1421 | * deletion below cannot fail. | ||
| 1405 | */ | 1422 | */ |
| 1423 | posix_cpu_timer_del(&timer); | ||
| 1406 | spin_unlock_irq(&timer.it_lock); | 1424 | spin_unlock_irq(&timer.it_lock); |
| 1407 | return 0; | 1425 | return 0; |
| 1408 | } | 1426 | } |
| @@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1420 | * We were interrupted by a signal. | 1438 | * We were interrupted by a signal. |
| 1421 | */ | 1439 | */ |
| 1422 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); | 1440 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); |
| 1423 | posix_cpu_timer_set(&timer, 0, &zero_it, it); | 1441 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); |
| 1442 | if (!error) { | ||
| 1443 | /* | ||
| 1444 | * Timer is now unarmed, deletion can not fail. | ||
| 1445 | */ | ||
| 1446 | posix_cpu_timer_del(&timer); | ||
| 1447 | } | ||
| 1424 | spin_unlock_irq(&timer.it_lock); | 1448 | spin_unlock_irq(&timer.it_lock); |
| 1425 | 1449 | ||
| 1450 | while (error == TIMER_RETRY) { | ||
| 1451 | /* | ||
| 1452 | * We need to handle the case when the timer was or is in | ||
| 1453 | * the middle of firing. In other cases we have already | ||
| 1454 | * freed the resources. | ||
| 1455 | */ | ||
| 1456 | spin_lock_irq(&timer.it_lock); | ||
| 1457 | error = posix_cpu_timer_del(&timer); | ||
| 1458 | spin_unlock_irq(&timer.it_lock); | ||
| 1459 | } | ||
| 1460 | |||
| 1426 | if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { | 1461 | if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { |
| 1427 | /* | 1462 | /* |
| 1428 | * It actually did fire already. | 1463 | * It actually did fire already. |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69185ae6b701..6edbb2c55c22 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 552 | return -EAGAIN; | 552 | return -EAGAIN; |
| 553 | 553 | ||
| 554 | spin_lock_init(&new_timer->it_lock); | 554 | spin_lock_init(&new_timer->it_lock); |
| 555 | retry: | 555 | |
| 556 | if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { | 556 | idr_preload(GFP_KERNEL); |
| 557 | error = -EAGAIN; | ||
| 558 | goto out; | ||
| 559 | } | ||
| 560 | spin_lock_irq(&idr_lock); | 557 | spin_lock_irq(&idr_lock); |
| 561 | error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); | 558 | error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); |
| 562 | spin_unlock_irq(&idr_lock); | 559 | spin_unlock_irq(&idr_lock); |
| 563 | if (error) { | 560 | idr_preload_end(); |
| 564 | if (error == -EAGAIN) | 561 | if (error < 0) { |
| 565 | goto retry; | ||
| 566 | /* | 562 | /* |
| 567 | * Weird looking, but we return EAGAIN if the IDR is | 563 | * Weird looking, but we return EAGAIN if the IDR is |
| 568 | * full (proper POSIX return value for this) | 564 | * full (proper POSIX return value for this) |
| 569 | */ | 565 | */ |
| 570 | error = -EAGAIN; | 566 | if (error == -ENOSPC) |
| 567 | error = -EAGAIN; | ||
| 571 | goto out; | 568 | goto out; |
| 572 | } | 569 | } |
| 570 | new_timer_id = error; | ||
| 573 | 571 | ||
| 574 | it_id_set = IT_ID_SET; | 572 | it_id_set = IT_ID_SET; |
| 575 | new_timer->it_id = (timer_t) new_timer_id; | 573 | new_timer->it_id = (timer_t) new_timer_id; |
| @@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
| 639 | { | 637 | { |
| 640 | struct k_itimer *timr; | 638 | struct k_itimer *timr; |
| 641 | 639 | ||
| 640 | /* | ||
| 641 | * timer_t could be any type >= int and we want to make sure any | ||
| 642 | * @timer_id outside positive int range fails lookup. | ||
| 643 | */ | ||
| 644 | if ((unsigned long long)timer_id > INT_MAX) | ||
| 645 | return NULL; | ||
| 646 | |||
| 642 | rcu_read_lock(); | 647 | rcu_read_lock(); |
| 643 | timr = idr_find(&posix_timers_id, (int)timer_id); | 648 | timr = idr_find(&posix_timers_id, (int)timer_id); |
| 644 | if (timr) { | 649 | if (timr) { |
| @@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | |||
| 997 | 1002 | ||
| 998 | err = kc->clock_adj(which_clock, &ktx); | 1003 | err = kc->clock_adj(which_clock, &ktx); |
| 999 | 1004 | ||
| 1000 | if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) | 1005 | if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) |
| 1001 | return -EFAULT; | 1006 | return -EFAULT; |
| 1002 | 1007 | ||
| 1003 | return err; | 1008 | return err; |
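The timer_create() hunk converts the old idr_pre_get()/idr_get_new() retry loop to the idr_preload()/idr_alloc()/idr_preload_end() pattern, which preallocates outside the spinlock and then allocates the id atomically inside it. A minimal kernel-side sketch of that pattern follows; my_idr, my_lock and alloc_id() are illustrative names only.

/* Sketch of the idr_preload()/idr_alloc() allocation pattern used above. */
#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int alloc_id(void *obj)
{
    int id;

    idr_preload(GFP_KERNEL);        /* may sleep; fills the per-cpu cache */
    spin_lock_irq(&my_lock);
    /* start = 0, end = 0 means "any non-negative id" */
    id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
    spin_unlock_irq(&my_lock);
    idr_preload_end();

    return id;  /* >= 0 on success, -ENOMEM or -ENOSPC on failure */
}

As in the hunk, a caller that must report EAGAIN when the id space is exhausted (the POSIX-mandated value for timer_create()) would translate -ENOSPC before returning.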
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c | |||
| @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); | |||
| 66 | 66 | ||
| 67 | void queue_up_suspend_work(void) | 67 | void queue_up_suspend_work(void) |
| 68 | { | 68 | { |
| 69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | 69 | if (autosleep_state > PM_SUSPEND_ON) |
| 70 | queue_work(autosleep_wq, &suspend_work); | 70 | queue_work(autosleep_wq, &suspend_work); |
| 71 | } | 71 | } |
| 72 | 72 | ||
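The autosleep change can drop the work_pending() test because queue_work() already refuses to requeue a pending work item (it simply returns false), so the open-coded check added nothing except a wider race window. A hedged sketch, with my_fn, my_work and kick() as invented names:

/* Sketch only: queue_work() is idempotent for an already-queued item. */
#include <linux/workqueue.h>

static void my_fn(struct work_struct *work) { }
static DECLARE_WORK(my_work, my_fn);

static void kick(struct workqueue_struct *wq)
{
    /* Returns false (and does nothing) if my_work is already queued,
     * so no separate work_pending() check is needed here. */
    queue_work(wq, &my_work);
}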
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 313 | static suspend_state_t decode_state(const char *buf, size_t n) | 313 | static suspend_state_t decode_state(const char *buf, size_t n) |
| 314 | { | 314 | { |
| 315 | #ifdef CONFIG_SUSPEND | 315 | #ifdef CONFIG_SUSPEND |
| 316 | suspend_state_t state = PM_SUSPEND_STANDBY; | 316 | suspend_state_t state = PM_SUSPEND_MIN; |
| 317 | const char * const *s; | 317 | const char * const *s; |
| 318 | #endif | 318 | #endif |
| 319 | char *p; | 319 | char *p; |
| @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); | |||
| 553 | 553 | ||
| 554 | #endif /* CONFIG_PM_TRACE */ | 554 | #endif /* CONFIG_PM_TRACE */ |
| 555 | 555 | ||
| 556 | #ifdef CONFIG_FREEZER | ||
| 557 | static ssize_t pm_freeze_timeout_show(struct kobject *kobj, | ||
| 558 | struct kobj_attribute *attr, char *buf) | ||
| 559 | { | ||
| 560 | return sprintf(buf, "%u\n", freeze_timeout_msecs); | ||
| 561 | } | ||
| 562 | |||
| 563 | static ssize_t pm_freeze_timeout_store(struct kobject *kobj, | ||
| 564 | struct kobj_attribute *attr, | ||
| 565 | const char *buf, size_t n) | ||
| 566 | { | ||
| 567 | unsigned long val; | ||
| 568 | |||
| 569 | if (kstrtoul(buf, 10, &val)) | ||
| 570 | return -EINVAL; | ||
| 571 | |||
| 572 | freeze_timeout_msecs = val; | ||
| 573 | return n; | ||
| 574 | } | ||
| 575 | |||
| 576 | power_attr(pm_freeze_timeout); | ||
| 577 | |||
| 578 | #endif /* CONFIG_FREEZER */ | ||
| 579 | |||
| 556 | static struct attribute * g[] = { | 580 | static struct attribute * g[] = { |
| 557 | &state_attr.attr, | 581 | &state_attr.attr, |
| 558 | #ifdef CONFIG_PM_TRACE | 582 | #ifdef CONFIG_PM_TRACE |
| @@ -576,6 +600,9 @@ static struct attribute * g[] = { | |||
| 576 | &pm_print_times_attr.attr, | 600 | &pm_print_times_attr.attr, |
| 577 | #endif | 601 | #endif |
| 578 | #endif | 602 | #endif |
| 603 | #ifdef CONFIG_FREEZER | ||
| 604 | &pm_freeze_timeout_attr.attr, | ||
| 605 | #endif | ||
| 579 | NULL, | 606 | NULL, |
| 580 | }; | 607 | }; |
| 581 | 608 | ||
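pm_freeze_timeout_show()/_store() follow the standard kobj_attribute show/store pattern; the power_attr() macro (defined in kernel/power/power.h, not shown here) just stitches them into an attribute named pm_freeze_timeout. The sketch below shows the same pattern for a generic kobject so the moving parts are visible outside the PM code; demo_val, demo_kobj and demo_init() are invented names and error handling is minimal.

/* Generic sketch of a writable sysfs tunable like pm_freeze_timeout. */
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/init.h>

static unsigned long demo_val = 20000;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
    return sprintf(buf, "%lu\n", demo_val);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t n)
{
    unsigned long val;

    if (kstrtoul(buf, 10, &val))
        return -EINVAL;
    demo_val = val;
    return n;
}

static struct kobj_attribute demo_attr = __ATTR(demo_val, 0644, demo_show, demo_store);
static struct kobject *demo_kobj;

static int __init demo_init(void)
{
    demo_kobj = kobject_create_and_add("demo", kernel_kobj);
    if (!demo_kobj)
        return -ENOMEM;
    return sysfs_create_file(demo_kobj, &demo_attr.attr);
}
late_initcall(demo_init);

With the patch applied, the freezer timeout can then be tuned at run time by writing a millisecond value to /sys/power/pm_freeze_timeout.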
diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -21,7 +21,7 @@ | |||
| 21 | /* | 21 | /* |
| 22 | * Timeout for stopping processes | 22 | * Timeout for stopping processes |
| 23 | */ | 23 | */ |
| 24 | #define TIMEOUT (20 * HZ) | 24 | unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; |
| 25 | 25 | ||
| 26 | static int try_to_freeze_tasks(bool user_only) | 26 | static int try_to_freeze_tasks(bool user_only) |
| 27 | { | 27 | { |
| @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 36 | 36 | ||
| 37 | do_gettimeofday(&start); | 37 | do_gettimeofday(&start); |
| 38 | 38 | ||
| 39 | end_time = jiffies + TIMEOUT; | 39 | end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); |
| 40 | 40 | ||
| 41 | if (!user_only) | 41 | if (!user_only) |
| 42 | freeze_workqueues_begin(); | 42 | freeze_workqueues_begin(); |
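Replacing the compile-time TIMEOUT constant with freeze_timeout_msecs means the millisecond value is converted to a jiffies deadline at every suspend attempt, which is what lets the sysfs knob above take effect immediately. The small userspace program below only illustrates that "recompute the deadline from a runtime-tunable millisecond value" arithmetic with CLOCK_MONOTONIC; it is not kernel code.

/* Userspace illustration of the deadline arithmetic done by
 * jiffies + msecs_to_jiffies(freeze_timeout_msecs) in try_to_freeze_tasks(). */
#include <stdio.h>
#include <time.h>

static unsigned int freeze_timeout_msecs = 20 * 1000;  /* runtime tunable */

static struct timespec deadline(void)
{
    struct timespec t;

    clock_gettime(CLOCK_MONOTONIC, &t);
    t.tv_sec  += freeze_timeout_msecs / 1000;
    t.tv_nsec += (long)(freeze_timeout_msecs % 1000) * 1000000L;
    if (t.tv_nsec >= 1000000000L) {
        t.tv_sec++;
        t.tv_nsec -= 1000000000L;
    }
    return t;
}

int main(void)
{
    struct timespec end = deadline();

    printf("give up at %ld.%09ld\n", (long)end.tv_sec, end.tv_nsec);
    return 0;
}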
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
| 359 | return; | 359 | return; |
| 360 | } | 360 | } |
| 361 | 361 | ||
| 362 | if (delayed_work_pending(&req->work)) | 362 | cancel_delayed_work_sync(&req->work); |
| 363 | cancel_delayed_work_sync(&req->work); | ||
| 364 | 363 | ||
| 365 | if (new_value != req->node.prio) | 364 | if (new_value != req->node.prio) |
| 366 | pm_qos_update_target( | 365 | pm_qos_update_target( |
| @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, | |||
| 386 | "%s called for unknown object.", __func__)) | 385 | "%s called for unknown object.", __func__)) |
| 387 | return; | 386 | return; |
| 388 | 387 | ||
| 389 | if (delayed_work_pending(&req->work)) | 388 | cancel_delayed_work_sync(&req->work); |
| 390 | cancel_delayed_work_sync(&req->work); | ||
| 391 | 389 | ||
| 392 | if (new_value != req->node.prio) | 390 | if (new_value != req->node.prio) |
| 393 | pm_qos_update_target( | 391 | pm_qos_update_target( |
| @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) | |||
| 416 | return; | 414 | return; |
| 417 | } | 415 | } |
| 418 | 416 | ||
| 419 | if (delayed_work_pending(&req->work)) | 417 | cancel_delayed_work_sync(&req->work); |
| 420 | cancel_delayed_work_sync(&req->work); | ||
| 421 | 418 | ||
| 422 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, | 419 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
| 423 | &req->node, PM_QOS_REMOVE_REQ, | 420 | &req->node, PM_QOS_REMOVE_REQ, |
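All three pm_qos hunks lean on the fact that cancel_delayed_work_sync() is safe to call on work that was never queued: it returns false in that case, and when the handler is currently running it waits for it to finish, which the old delayed_work_pending() guard could not guarantee (the work could fire between the check and the cancel). A hedged sketch of the idiom, with my_dwork and my_timeout_fn as invented names:

/* Sketch: cancelling idle delayed work is a no-op, so no pending check
 * is needed before the cancel. */
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_timeout_fn(struct work_struct *work) { }
static DECLARE_DELAYED_WORK(my_dwork, my_timeout_fn);

static void update(void)
{
    /* Always cancel first; returns false if nothing was queued and
     * waits for a running my_timeout_fn() to finish either way. */
    cancel_delayed_work_sync(&my_dwork);
    schedule_delayed_work(&my_dwork, msecs_to_jiffies(100));
}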
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..d4feda084a3a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -30,12 +30,38 @@ | |||
| 30 | #include "power.h" | 30 | #include "power.h" |
| 31 | 31 | ||
| 32 | const char *const pm_states[PM_SUSPEND_MAX] = { | 32 | const char *const pm_states[PM_SUSPEND_MAX] = { |
| 33 | [PM_SUSPEND_FREEZE] = "freeze", | ||
| 33 | [PM_SUSPEND_STANDBY] = "standby", | 34 | [PM_SUSPEND_STANDBY] = "standby", |
| 34 | [PM_SUSPEND_MEM] = "mem", | 35 | [PM_SUSPEND_MEM] = "mem", |
| 35 | }; | 36 | }; |
| 36 | 37 | ||
| 37 | static const struct platform_suspend_ops *suspend_ops; | 38 | static const struct platform_suspend_ops *suspend_ops; |
| 38 | 39 | ||
| 40 | static bool need_suspend_ops(suspend_state_t state) | ||
| 41 | { | ||
| 42 | return !!(state > PM_SUSPEND_FREEZE); | ||
| 43 | } | ||
| 44 | |||
| 45 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | ||
| 46 | static bool suspend_freeze_wake; | ||
| 47 | |||
| 48 | static void freeze_begin(void) | ||
| 49 | { | ||
| 50 | suspend_freeze_wake = false; | ||
| 51 | } | ||
| 52 | |||
| 53 | static void freeze_enter(void) | ||
| 54 | { | ||
| 55 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); | ||
| 56 | } | ||
| 57 | |||
| 58 | void freeze_wake(void) | ||
| 59 | { | ||
| 60 | suspend_freeze_wake = true; | ||
| 61 | wake_up(&suspend_freeze_wait_head); | ||
| 62 | } | ||
| 63 | EXPORT_SYMBOL_GPL(freeze_wake); | ||
| 64 | |||
| 39 | /** | 65 | /** |
| 40 | * suspend_set_ops - Set the global suspend method table. | 66 | * suspend_set_ops - Set the global suspend method table. |
| 41 | * @ops: Suspend operations to use. | 67 | * @ops: Suspend operations to use. |
| @@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); | |||
| 50 | 76 | ||
| 51 | bool valid_state(suspend_state_t state) | 77 | bool valid_state(suspend_state_t state) |
| 52 | { | 78 | { |
| 79 | if (state == PM_SUSPEND_FREEZE) | ||
| 80 | return true; | ||
| 53 | /* | 81 | /* |
| 54 | * All states need lowlevel support and need to be valid to the lowlevel | 82 | * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low-level |
| 83 | * support and need to be valid to the low-level | ||
| 55 | * implementation, no valid callback implies that none are valid. | 84 | * implementation; no valid callback implies that none are valid. |
| 56 | */ | 85 | */ |
| 57 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | 86 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); |
| @@ -89,11 +118,11 @@ static int suspend_test(int level) | |||
| 89 | * hibernation). Run suspend notifiers, allocate the "suspend" console and | 118 | * hibernation). Run suspend notifiers, allocate the "suspend" console and |
| 90 | * freeze processes. | 119 | * freeze processes. |
| 91 | */ | 120 | */ |
| 92 | static int suspend_prepare(void) | 121 | static int suspend_prepare(suspend_state_t state) |
| 93 | { | 122 | { |
| 94 | int error; | 123 | int error; |
| 95 | 124 | ||
| 96 | if (!suspend_ops || !suspend_ops->enter) | 125 | if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) |
| 97 | return -EPERM; | 126 | return -EPERM; |
| 98 | 127 | ||
| 99 | pm_prepare_console(); | 128 | pm_prepare_console(); |
| @@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 137 | { | 166 | { |
| 138 | int error; | 167 | int error; |
| 139 | 168 | ||
| 140 | if (suspend_ops->prepare) { | 169 | if (need_suspend_ops(state) && suspend_ops->prepare) { |
| 141 | error = suspend_ops->prepare(); | 170 | error = suspend_ops->prepare(); |
| 142 | if (error) | 171 | if (error) |
| 143 | goto Platform_finish; | 172 | goto Platform_finish; |
| @@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 149 | goto Platform_finish; | 178 | goto Platform_finish; |
| 150 | } | 179 | } |
| 151 | 180 | ||
| 152 | if (suspend_ops->prepare_late) { | 181 | if (need_suspend_ops(state) && suspend_ops->prepare_late) { |
| 153 | error = suspend_ops->prepare_late(); | 182 | error = suspend_ops->prepare_late(); |
| 154 | if (error) | 183 | if (error) |
| 155 | goto Platform_wake; | 184 | goto Platform_wake; |
| 156 | } | 185 | } |
| 157 | 186 | ||
| 187 | /* | ||
| 188 | * PM_SUSPEND_FREEZE equals | ||
| 189 | * frozen processes + suspended devices + idle processors. | ||
| 190 | * Thus we should invoke freeze_enter() soon after | ||
| 191 | * all the devices are suspended. | ||
| 192 | */ | ||
| 193 | if (state == PM_SUSPEND_FREEZE) { | ||
| 194 | freeze_enter(); | ||
| 195 | goto Platform_wake; | ||
| 196 | } | ||
| 197 | |||
| 158 | if (suspend_test(TEST_PLATFORM)) | 198 | if (suspend_test(TEST_PLATFORM)) |
| 159 | goto Platform_wake; | 199 | goto Platform_wake; |
| 160 | 200 | ||
| @@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 182 | enable_nonboot_cpus(); | 222 | enable_nonboot_cpus(); |
| 183 | 223 | ||
| 184 | Platform_wake: | 224 | Platform_wake: |
| 185 | if (suspend_ops->wake) | 225 | if (need_suspend_ops(state) && suspend_ops->wake) |
| 186 | suspend_ops->wake(); | 226 | suspend_ops->wake(); |
| 187 | 227 | ||
| 188 | dpm_resume_start(PMSG_RESUME); | 228 | dpm_resume_start(PMSG_RESUME); |
| 189 | 229 | ||
| 190 | Platform_finish: | 230 | Platform_finish: |
| 191 | if (suspend_ops->finish) | 231 | if (need_suspend_ops(state) && suspend_ops->finish) |
| 192 | suspend_ops->finish(); | 232 | suspend_ops->finish(); |
| 193 | 233 | ||
| 194 | return error; | 234 | return error; |
| @@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 203 | int error; | 243 | int error; |
| 204 | bool wakeup = false; | 244 | bool wakeup = false; |
| 205 | 245 | ||
| 206 | if (!suspend_ops) | 246 | if (need_suspend_ops(state) && !suspend_ops) |
| 207 | return -ENOSYS; | 247 | return -ENOSYS; |
| 208 | 248 | ||
| 209 | trace_machine_suspend(state); | 249 | trace_machine_suspend(state); |
| 210 | if (suspend_ops->begin) { | 250 | if (need_suspend_ops(state) && suspend_ops->begin) { |
| 211 | error = suspend_ops->begin(state); | 251 | error = suspend_ops->begin(state); |
| 212 | if (error) | 252 | if (error) |
| 213 | goto Close; | 253 | goto Close; |
| @@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 226 | 266 | ||
| 227 | do { | 267 | do { |
| 228 | error = suspend_enter(state, &wakeup); | 268 | error = suspend_enter(state, &wakeup); |
| 229 | } while (!error && !wakeup | 269 | } while (!error && !wakeup && need_suspend_ops(state) |
| 230 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | 270 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); |
| 231 | 271 | ||
| 232 | Resume_devices: | 272 | Resume_devices: |
| @@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 236 | ftrace_start(); | 276 | ftrace_start(); |
| 237 | resume_console(); | 277 | resume_console(); |
| 238 | Close: | 278 | Close: |
| 239 | if (suspend_ops->end) | 279 | if (need_suspend_ops(state) && suspend_ops->end) |
| 240 | suspend_ops->end(); | 280 | suspend_ops->end(); |
| 241 | trace_machine_suspend(PWR_EVENT_EXIT); | 281 | trace_machine_suspend(PWR_EVENT_EXIT); |
| 242 | return error; | 282 | return error; |
| 243 | 283 | ||
| 244 | Recover_platform: | 284 | Recover_platform: |
| 245 | if (suspend_ops->recover) | 285 | if (need_suspend_ops(state) && suspend_ops->recover) |
| 246 | suspend_ops->recover(); | 286 | suspend_ops->recover(); |
| 247 | goto Resume_devices; | 287 | goto Resume_devices; |
| 248 | } | 288 | } |
| @@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state) | |||
| 278 | if (!mutex_trylock(&pm_mutex)) | 318 | if (!mutex_trylock(&pm_mutex)) |
| 279 | return -EBUSY; | 319 | return -EBUSY; |
| 280 | 320 | ||
| 321 | if (state == PM_SUSPEND_FREEZE) | ||
| 322 | freeze_begin(); | ||
| 323 | |||
| 281 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 324 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
| 282 | sys_sync(); | 325 | sys_sync(); |
| 283 | printk("done.\n"); | 326 | printk("done.\n"); |
| 284 | 327 | ||
| 285 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 328 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
| 286 | error = suspend_prepare(); | 329 | error = suspend_prepare(state); |
| 287 | if (error) | 330 | if (error) |
| 288 | goto Unlock; | 331 | goto Unlock; |
| 289 | 332 | ||
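The new "freeze" state reduces to a flag plus a wait queue: freeze_begin() clears suspend_freeze_wake before devices are suspended, freeze_enter() blocks in wait_event() once they are, and a wakeup source calls freeze_wake() to set the flag and wake the sleeper. The stand-alone program below models the same handshake in userspace with a condition variable instead of a wait queue; every name in it is illustrative.

/* Userspace model of the freeze_begin()/freeze_enter()/freeze_wake()
 * handshake: a flag guarded by a lock plus a sleep/wakeup primitive. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool wake_flag;

static void freeze_begin(void)      /* before suspending devices */
{
    pthread_mutex_lock(&lock);
    wake_flag = false;
    pthread_mutex_unlock(&lock);
}

static void freeze_enter(void)      /* "idle" until a wakeup source fires */
{
    pthread_mutex_lock(&lock);
    while (!wake_flag)
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

static void freeze_wake(void)       /* called from the wakeup path */
{
    pthread_mutex_lock(&lock);
    wake_flag = true;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
}

static void *irq_thread(void *arg)
{
    sleep(1);
    freeze_wake();
    return NULL;
}

int main(void)
{
    pthread_t t;

    freeze_begin();
    pthread_create(&t, NULL, irq_thread, NULL);
    freeze_enter();                 /* returns once freeze_wake() ran */
    pthread_join(t, NULL);
    puts("resumed");
    return 0;
}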
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
| @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 112 | rtc_set_alarm(rtc, &alm); | 112 | rtc_set_alarm(rtc, &alm); |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | static int __init has_wakealarm(struct device *dev, void *name_ptr) | 115 | static int __init has_wakealarm(struct device *dev, const void *data) |
| 116 | { | 116 | { |
| 117 | struct rtc_device *candidate = to_rtc_device(dev); | 117 | struct rtc_device *candidate = to_rtc_device(dev); |
| 118 | 118 | ||
| @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) | |||
| 121 | if (!device_may_wakeup(candidate->dev.parent)) | 121 | if (!device_may_wakeup(candidate->dev.parent)) |
| 122 | return 0; | 122 | return 0; |
| 123 | 123 | ||
| 124 | *(const char **)name_ptr = dev_name(dev); | ||
| 125 | return 1; | 124 | return 1; |
| 126 | } | 125 | } |
| 127 | 126 | ||
| @@ -159,8 +158,8 @@ static int __init test_suspend(void) | |||
| 159 | static char warn_no_rtc[] __initdata = | 158 | static char warn_no_rtc[] __initdata = |
| 160 | KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; | 159 | KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; |
| 161 | 160 | ||
| 162 | char *pony = NULL; | ||
| 163 | struct rtc_device *rtc = NULL; | 161 | struct rtc_device *rtc = NULL; |
| 162 | struct device *dev; | ||
| 164 | 163 | ||
| 165 | /* PM is initialized by now; is that state testable? */ | 164 | /* PM is initialized by now; is that state testable? */ |
| 166 | if (test_state == PM_SUSPEND_ON) | 165 | if (test_state == PM_SUSPEND_ON) |
| @@ -171,9 +170,9 @@ static int __init test_suspend(void) | |||
| 171 | } | 170 | } |
| 172 | 171 | ||
| 173 | /* RTCs have initialized by now too ... can we use one? */ | 172 | /* RTCs have initialized by now too ... can we use one? */ |
| 174 | class_find_device(rtc_class, NULL, &pony, has_wakealarm); | 173 | dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); |
| 175 | if (pony) | 174 | if (dev) |
| 176 | rtc = rtc_class_open(pony); | 175 | rtc = rtc_class_open(dev_name(dev)); |
| 177 | if (!rtc) { | 176 | if (!rtc) { |
| 178 | printk(warn_no_rtc); | 177 | printk(warn_no_rtc); |
| 179 | goto done; | 178 | goto done; |
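The suspend_test change follows the reworked class_find_device() convention: the match callback takes a const void * and only reports whether the device matches, while the caller works with the returned struct device * (here via dev_name()) instead of smuggling the name out through the data pointer. A small sketch of that convention, assuming the constified prototype used by the hunk; my_match() and my_lookup() are invented names, rtc_class is the real class used above.

/* Sketch of the post-change class_find_device() convention. */
#include <linux/device.h>
#include <linux/pm_wakeup.h>
#include <linux/rtc.h>

static int my_match(struct device *dev, const void *data)
{
    /* Non-zero stops the iteration at this device. */
    return dev->parent && device_may_wakeup(dev->parent);
}

static const char *my_lookup(void)
{
    /* class_find_device() returns the device with a reference held;
     * a long-lived caller would drop it with put_device() when done. */
    struct device *dev = class_find_device(rtc_class, NULL, NULL, my_match);

    return dev ? dev_name(dev) : NULL;
}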
diff --git a/kernel/printk.c b/kernel/printk.c index 357f714ddd49..0b31715f335a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -42,6 +42,7 @@ | |||
| 42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
| 43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
| 44 | #include <linux/poll.h> | 44 | #include <linux/poll.h> |
| 45 | #include <linux/irq_work.h> | ||
| 45 | 46 | ||
| 46 | #include <asm/uaccess.h> | 47 | #include <asm/uaccess.h> |
| 47 | 48 | ||
| @@ -1967,30 +1968,32 @@ int is_console_locked(void) | |||
| 1967 | static DEFINE_PER_CPU(int, printk_pending); | 1968 | static DEFINE_PER_CPU(int, printk_pending); |
| 1968 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | 1969 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); |
| 1969 | 1970 | ||
| 1970 | void printk_tick(void) | 1971 | static void wake_up_klogd_work_func(struct irq_work *irq_work) |
| 1971 | { | 1972 | { |
| 1972 | if (__this_cpu_read(printk_pending)) { | 1973 | int pending = __this_cpu_xchg(printk_pending, 0); |
| 1973 | int pending = __this_cpu_xchg(printk_pending, 0); | 1974 | |
| 1974 | if (pending & PRINTK_PENDING_SCHED) { | 1975 | if (pending & PRINTK_PENDING_SCHED) { |
| 1975 | char *buf = __get_cpu_var(printk_sched_buf); | 1976 | char *buf = __get_cpu_var(printk_sched_buf); |
| 1976 | printk(KERN_WARNING "[sched_delayed] %s", buf); | 1977 | printk(KERN_WARNING "[sched_delayed] %s", buf); |
| 1977 | } | ||
| 1978 | if (pending & PRINTK_PENDING_WAKEUP) | ||
| 1979 | wake_up_interruptible(&log_wait); | ||
| 1980 | } | 1978 | } |
| 1981 | } | ||
| 1982 | 1979 | ||
| 1983 | int printk_needs_cpu(int cpu) | 1980 | if (pending & PRINTK_PENDING_WAKEUP) |
| 1984 | { | 1981 | wake_up_interruptible(&log_wait); |
| 1985 | if (cpu_is_offline(cpu)) | ||
| 1986 | printk_tick(); | ||
| 1987 | return __this_cpu_read(printk_pending); | ||
| 1988 | } | 1982 | } |
| 1989 | 1983 | ||
| 1984 | static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { | ||
| 1985 | .func = wake_up_klogd_work_func, | ||
| 1986 | .flags = IRQ_WORK_LAZY, | ||
| 1987 | }; | ||
| 1988 | |||
| 1990 | void wake_up_klogd(void) | 1989 | void wake_up_klogd(void) |
| 1991 | { | 1990 | { |
| 1992 | if (waitqueue_active(&log_wait)) | 1991 | preempt_disable(); |
| 1992 | if (waitqueue_active(&log_wait)) { | ||
| 1993 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1993 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
| 1994 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | ||
| 1995 | } | ||
| 1996 | preempt_enable(); | ||
| 1994 | } | 1997 | } |
| 1995 | 1998 | ||
| 1996 | static void console_cont_flush(char *text, size_t size) | 1999 | static void console_cont_flush(char *text, size_t size) |
| @@ -2471,6 +2474,7 @@ int printk_sched(const char *fmt, ...) | |||
| 2471 | va_end(args); | 2474 | va_end(args); |
| 2472 | 2475 | ||
| 2473 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | 2476 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); |
| 2477 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | ||
| 2474 | local_irq_restore(flags); | 2478 | local_irq_restore(flags); |
| 2475 | 2479 | ||
| 2476 | return r; | 2480 | return r; |
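The printk rework drops the polled printk_tick()/printk_needs_cpu() pair in favour of a lazy per-CPU irq_work, so the klogd wakeup can be requested from contexts where calling wake_up() directly is unsafe and is delivered on the next irq_work pass or timer tick. The sketch below shows just that pattern; deferred_fn(), deferred_work and request_deferred() are invented names, and it assumes the irq_work API of this era (plain flags field, IRQ_WORK_LAZY, __get_cpu_var()).

/* Sketch of the per-CPU lazy irq_work pattern used for the klogd wakeup. */
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static void deferred_fn(struct irq_work *work)
{
    /* Runs later from a safe interrupt context; do the real wakeup here. */
}

static DEFINE_PER_CPU(struct irq_work, deferred_work) = {
    .func  = deferred_fn,
    .flags = IRQ_WORK_LAZY,     /* may wait until the next timer tick */
};

static void request_deferred(void)
{
    preempt_disable();          /* pin the CPU while touching per-cpu data */
    irq_work_queue(&__get_cpu_var(deferred_work));
    preempt_enable();
}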
diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42f..dc3384ee874e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -37,9 +37,6 @@ struct profile_hit { | |||
| 37 | #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) | 37 | #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) |
| 38 | #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) | 38 | #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) |
| 39 | 39 | ||
| 40 | /* Oprofile timer tick hook */ | ||
| 41 | static int (*timer_hook)(struct pt_regs *) __read_mostly; | ||
| 42 | |||
| 43 | static atomic_t *prof_buffer; | 40 | static atomic_t *prof_buffer; |
| 44 | static unsigned long prof_len, prof_shift; | 41 | static unsigned long prof_len, prof_shift; |
| 45 | 42 | ||
| @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) | |||
| 208 | } | 205 | } |
| 209 | EXPORT_SYMBOL_GPL(profile_event_unregister); | 206 | EXPORT_SYMBOL_GPL(profile_event_unregister); |
| 210 | 207 | ||
| 211 | int register_timer_hook(int (*hook)(struct pt_regs *)) | ||
| 212 | { | ||
| 213 | if (timer_hook) | ||
| 214 | return -EBUSY; | ||
| 215 | timer_hook = hook; | ||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
| 219 | |||
| 220 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | ||
| 221 | { | ||
| 222 | WARN_ON(hook != timer_hook); | ||
| 223 | timer_hook = NULL; | ||
| 224 | /* make sure all CPUs see the NULL hook */ | ||
| 225 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | ||
| 226 | } | ||
| 227 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | ||
| 228 | |||
| 229 | |||
| 230 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
| 231 | /* | 209 | /* |
| 232 | * Each cpu has a pair of open-addressed hashtables for pending | 210 | * Each cpu has a pair of open-addressed hashtables for pending |
| @@ -436,8 +414,6 @@ void profile_tick(int type) | |||
| 436 | { | 414 | { |
| 437 | struct pt_regs *regs = get_irq_regs(); | 415 | struct pt_regs *regs = get_irq_regs(); |
| 438 | 416 | ||
| 439 | if (type == CPU_PROFILING && timer_hook) | ||
| 440 | timer_hook(regs); | ||
| 441 | if (!user_mode(regs) && prof_cpu_mask != NULL && | 417 | if (!user_mode(regs) && prof_cpu_mask != NULL && |
| 442 | cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) | 418 | cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) |
| 443 | profile_hit(type, (void *)profile_pc(regs)); | 419 | profile_hit(type, (void *)profile_pc(regs)); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1599157336a6..acbd28424d81 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 117 | * TASK_KILLABLE sleeps. | 117 | * TASK_KILLABLE sleeps. |
| 118 | */ | 118 | */ |
| 119 | if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) | 119 | if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) |
| 120 | signal_wake_up(child, task_is_traced(child)); | 120 | ptrace_signal_wake_up(child, true); |
| 121 | 121 | ||
| 122 | spin_unlock(&child->sighand->siglock); | 122 | spin_unlock(&child->sighand->siglock); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | /* Ensure that nothing can wake it up, even SIGKILL */ | ||
| 126 | static bool ptrace_freeze_traced(struct task_struct *task) | ||
| 127 | { | ||
| 128 | bool ret = false; | ||
| 129 | |||
| 130 | /* Lockless, nobody but us can set this flag */ | ||
| 131 | if (task->jobctl & JOBCTL_LISTENING) | ||
| 132 | return ret; | ||
| 133 | |||
| 134 | spin_lock_irq(&task->sighand->siglock); | ||
| 135 | if (task_is_traced(task) && !__fatal_signal_pending(task)) { | ||
| 136 | task->state = __TASK_TRACED; | ||
| 137 | ret = true; | ||
| 138 | } | ||
| 139 | spin_unlock_irq(&task->sighand->siglock); | ||
| 140 | |||
| 141 | return ret; | ||
| 142 | } | ||
| 143 | |||
| 144 | static void ptrace_unfreeze_traced(struct task_struct *task) | ||
| 145 | { | ||
| 146 | if (task->state != __TASK_TRACED) | ||
| 147 | return; | ||
| 148 | |||
| 149 | WARN_ON(!task->ptrace || task->parent != current); | ||
| 150 | |||
| 151 | spin_lock_irq(&task->sighand->siglock); | ||
| 152 | if (__fatal_signal_pending(task)) | ||
| 153 | wake_up_state(task, __TASK_TRACED); | ||
| 154 | else | ||
| 155 | task->state = TASK_TRACED; | ||
| 156 | spin_unlock_irq(&task->sighand->siglock); | ||
| 157 | } | ||
| 158 | |||
| 125 | /** | 159 | /** |
| 126 | * ptrace_check_attach - check whether ptracee is ready for ptrace operation | 160 | * ptrace_check_attach - check whether ptracee is ready for ptrace operation |
| 127 | * @child: ptracee to check for | 161 | * @child: ptracee to check for |
| @@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 139 | * RETURNS: | 173 | * RETURNS: |
| 140 | * 0 on success, -ESRCH if %child is not ready. | 174 | * 0 on success, -ESRCH if %child is not ready. |
| 141 | */ | 175 | */ |
| 142 | int ptrace_check_attach(struct task_struct *child, bool ignore_state) | 176 | static int ptrace_check_attach(struct task_struct *child, bool ignore_state) |
| 143 | { | 177 | { |
| 144 | int ret = -ESRCH; | 178 | int ret = -ESRCH; |
| 145 | 179 | ||
| @@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) | |||
| 151 | * be changed by us so it's not changing right after this. | 185 | * be changed by us so it's not changing right after this. |
| 152 | */ | 186 | */ |
| 153 | read_lock(&tasklist_lock); | 187 | read_lock(&tasklist_lock); |
| 154 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { | 188 | if (child->ptrace && child->parent == current) { |
| 189 | WARN_ON(child->state == __TASK_TRACED); | ||
| 155 | /* | 190 | /* |
| 156 | * child->sighand can't be NULL, release_task() | 191 | * child->sighand can't be NULL, release_task() |
| 157 | * does ptrace_unlink() before __exit_signal(). | 192 | * does ptrace_unlink() before __exit_signal(). |
| 158 | */ | 193 | */ |
| 159 | spin_lock_irq(&child->sighand->siglock); | 194 | if (ignore_state || ptrace_freeze_traced(child)) |
| 160 | WARN_ON_ONCE(task_is_stopped(child)); | ||
| 161 | if (ignore_state || (task_is_traced(child) && | ||
| 162 | !(child->jobctl & JOBCTL_LISTENING))) | ||
| 163 | ret = 0; | 195 | ret = 0; |
| 164 | spin_unlock_irq(&child->sighand->siglock); | ||
| 165 | } | 196 | } |
| 166 | read_unlock(&tasklist_lock); | 197 | read_unlock(&tasklist_lock); |
| 167 | 198 | ||
| 168 | if (!ret && !ignore_state) | 199 | if (!ret && !ignore_state) { |
| 169 | ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; | 200 | if (!wait_task_inactive(child, __TASK_TRACED)) { |
| 201 | /* | ||
| 202 | * This can only happen if may_ptrace_stop() fails and | ||
| 203 | * ptrace_stop() changes ->state back to TASK_RUNNING, | ||
| 204 | * so we should not worry about leaking __TASK_TRACED. | ||
| 205 | */ | ||
| 206 | WARN_ON(child->state == __TASK_TRACED); | ||
| 207 | ret = -ESRCH; | ||
| 208 | } | ||
| 209 | } | ||
| 170 | 210 | ||
| 171 | /* All systems go.. */ | ||
| 172 | return ret; | 211 | return ret; |
| 173 | } | 212 | } |
| 174 | 213 | ||
| @@ -317,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
| 317 | */ | 356 | */ |
| 318 | if (task_is_stopped(task) && | 357 | if (task_is_stopped(task) && |
| 319 | task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) | 358 | task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) |
| 320 | signal_wake_up(task, 1); | 359 | signal_wake_up_state(task, __TASK_STOPPED); |
| 321 | 360 | ||
| 322 | spin_unlock(&task->sighand->siglock); | 361 | spin_unlock(&task->sighand->siglock); |
| 323 | 362 | ||
| @@ -673,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
| 673 | kiov->iov_len, kiov->iov_base); | 712 | kiov->iov_len, kiov->iov_base); |
| 674 | } | 713 | } |
| 675 | 714 | ||
| 715 | /* | ||
| 716 | * This is declared in linux/regset.h and defined in machine-dependent | ||
| 717 | * code. We put the export here, near the primary machine-neutral use, | ||
| 718 | * to ensure no machine forgets it. | ||
| 719 | */ | ||
| 720 | EXPORT_SYMBOL_GPL(task_user_regset_view); | ||
| 676 | #endif | 721 | #endif |
| 677 | 722 | ||
| 678 | int ptrace_request(struct task_struct *child, long request, | 723 | int ptrace_request(struct task_struct *child, long request, |
| @@ -737,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 737 | * tracee into STOP. | 782 | * tracee into STOP. |
| 738 | */ | 783 | */ |
| 739 | if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) | 784 | if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) |
| 740 | signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); | 785 | ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); |
| 741 | 786 | ||
| 742 | unlock_task_sighand(child, &flags); | 787 | unlock_task_sighand(child, &flags); |
| 743 | ret = 0; | 788 | ret = 0; |
| @@ -763,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 763 | * start of this trap and now. Trigger re-trap. | 808 | * start of this trap and now. Trigger re-trap. |
| 764 | */ | 809 | */ |
| 765 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) | 810 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) |
| 766 | signal_wake_up(child, true); | 811 | ptrace_signal_wake_up(child, true); |
| 767 | ret = 0; | 812 | ret = 0; |
| 768 | } | 813 | } |
| 769 | unlock_task_sighand(child, &flags); | 814 | unlock_task_sighand(child, &flags); |
| @@ -900,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
| 900 | goto out_put_task_struct; | 945 | goto out_put_task_struct; |
| 901 | 946 | ||
| 902 | ret = arch_ptrace(child, request, addr, data); | 947 | ret = arch_ptrace(child, request, addr, data); |
| 948 | if (ret || request != PTRACE_DETACH) | ||
| 949 | ptrace_unfreeze_traced(child); | ||
| 903 | 950 | ||
| 904 | out_put_task_struct: | 951 | out_put_task_struct: |
| 905 | put_task_struct(child); | 952 | put_task_struct(child); |
| @@ -1039,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
| 1039 | 1086 | ||
| 1040 | ret = ptrace_check_attach(child, request == PTRACE_KILL || | 1087 | ret = ptrace_check_attach(child, request == PTRACE_KILL || |
| 1041 | request == PTRACE_INTERRUPT); | 1088 | request == PTRACE_INTERRUPT); |
| 1042 | if (!ret) | 1089 | if (!ret) { |
| 1043 | ret = compat_arch_ptrace(child, request, addr, data); | 1090 | ret = compat_arch_ptrace(child, request, addr, data); |
| 1091 | if (ret || request != PTRACE_DETACH) | ||
| 1092 | ptrace_unfreeze_traced(child); | ||
| 1093 | } | ||
| 1044 | 1094 | ||
| 1045 | out_put_task_struct: | 1095 | out_put_task_struct: |
| 1046 | put_task_struct(child); | 1096 | put_task_struct(child); |
diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2b..7f8e7590e3e5 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
| @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | |||
| 111 | 111 | ||
| 112 | extern int rcu_expedited; | 112 | extern int rcu_expedited; |
| 113 | 113 | ||
| 114 | #ifdef CONFIG_RCU_STALL_COMMON | ||
| 115 | |||
| 116 | extern int rcu_cpu_stall_suppress; | ||
| 117 | int rcu_jiffies_till_stall_check(void); | ||
| 118 | |||
| 119 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | ||
| 120 | |||
| 114 | #endif /* __LINUX_RCU_H */ | 121 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b44..48ab70384a4c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | |||
| 404 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 404 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 405 | 405 | ||
| 406 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 406 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
| 407 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) | 407 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, |
| 408 | unsigned long secs, | ||
| 409 | unsigned long c_old, unsigned long c) | ||
| 408 | { | 410 | { |
| 409 | trace_rcu_torture_read(rcutorturename, rhp); | 411 | trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); |
| 410 | } | 412 | } |
| 411 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | 413 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); |
| 412 | #else | 414 | #else |
| 413 | #define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) | 415 | #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ |
| 416 | do { } while (0) | ||
| 414 | #endif | 417 | #endif |
| 418 | |||
| 419 | #ifdef CONFIG_RCU_STALL_COMMON | ||
| 420 | |||
| 421 | #ifdef CONFIG_PROVE_RCU | ||
| 422 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | ||
| 423 | #else | ||
| 424 | #define RCU_STALL_DELAY_DELTA 0 | ||
| 425 | #endif | ||
| 426 | |||
| 427 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | ||
| 428 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
| 429 | |||
| 430 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
| 431 | module_param(rcu_cpu_stall_timeout, int, 0644); | ||
| 432 | |||
| 433 | int rcu_jiffies_till_stall_check(void) | ||
| 434 | { | ||
| 435 | int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); | ||
| 436 | |||
| 437 | /* | ||
| 438 | * Limit check must be consistent with the Kconfig limits | ||
| 439 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
| 440 | */ | ||
| 441 | if (till_stall_check < 3) { | ||
| 442 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; | ||
| 443 | till_stall_check = 3; | ||
| 444 | } else if (till_stall_check > 300) { | ||
| 445 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; | ||
| 446 | till_stall_check = 300; | ||
| 447 | } | ||
| 448 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
| 449 | } | ||
| 450 | |||
| 451 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | ||
| 452 | { | ||
| 453 | rcu_cpu_stall_suppress = 1; | ||
| 454 | return NOTIFY_DONE; | ||
| 455 | } | ||
| 456 | |||
| 457 | static struct notifier_block rcu_panic_block = { | ||
| 458 | .notifier_call = rcu_panic, | ||
| 459 | }; | ||
| 460 | |||
| 461 | static int __init check_cpu_stall_init(void) | ||
| 462 | { | ||
| 463 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | ||
| 464 | return 0; | ||
| 465 | } | ||
| 466 | early_initcall(check_cpu_stall_init); | ||
| 467 | |||
| 468 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | ||
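rcu_jiffies_till_stall_check() clamps the rcu_cpu_stall_timeout module parameter to the 3..300 second range allowed by CONFIG_RCU_CPU_STALL_TIMEOUT, writes the clamped value back, and converts it to jiffies (padded by 5*HZ under PROVE_RCU). The stand-alone program below replays only the clamp-and-convert arithmetic with an illustrative HZ; it omits the write-back and the ACCESS_ONCE() annotations.

/* Userspace replay of rcu_jiffies_till_stall_check(): clamp to [3, 300]
 * seconds, then convert to jiffies. HZ here is illustrative. */
#include <stdio.h>

#define HZ 250
#define RCU_STALL_DELAY_DELTA 0     /* (5 * HZ) when PROVE_RCU=y */

static int rcu_cpu_stall_timeout = 21;  /* module parameter, in seconds */

static int jiffies_till_stall_check(void)
{
    int t = rcu_cpu_stall_timeout;

    if (t < 3)
        t = 3;
    else if (t > 300)
        t = 300;
    return t * HZ + RCU_STALL_DELAY_DELTA;
}

int main(void)
{
    const int vals[] = { -5, 0, 21, 60, 1000 };
    const int n = sizeof(vals) / sizeof(vals[0]);

    for (int i = 0; i < n; i++) {
        rcu_cpu_stall_timeout = vals[i];
        printf("timeout=%4d -> %d jiffies\n", vals[i],
               jiffies_till_stall_check());
    }
    return 0;
}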
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2a..a0714a51b6d7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head, | |||
| 51 | void (*func)(struct rcu_head *rcu), | 51 | void (*func)(struct rcu_head *rcu), |
| 52 | struct rcu_ctrlblk *rcp); | 52 | struct rcu_ctrlblk *rcp); |
| 53 | 53 | ||
| 54 | #include "rcutiny_plugin.h" | ||
| 55 | |||
| 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 54 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 57 | 55 | ||
| 56 | #include "rcutiny_plugin.h" | ||
| 57 | |||
| 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
| 59 | static void rcu_idle_enter_common(long long newval) | 59 | static void rcu_idle_enter_common(long long newval) |
| 60 | { | 60 | { |
| @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); | |||
| 193 | * interrupts don't count, we must be running at the first interrupt | 193 | * interrupts don't count, we must be running at the first interrupt |
| 194 | * level. | 194 | * level. |
| 195 | */ | 195 | */ |
| 196 | int rcu_is_cpu_rrupt_from_idle(void) | 196 | static int rcu_is_cpu_rrupt_from_idle(void) |
| 197 | { | 197 | { |
| 198 | return rcu_dynticks_nesting <= 1; | 198 | return rcu_dynticks_nesting <= 1; |
| 199 | } | 199 | } |
| @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void) | |||
| 205 | */ | 205 | */ |
| 206 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 206 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
| 207 | { | 207 | { |
| 208 | reset_cpu_stall_ticks(rcp); | ||
| 208 | if (rcp->rcucblist != NULL && | 209 | if (rcp->rcucblist != NULL && |
| 209 | rcp->donetail != rcp->curtail) { | 210 | rcp->donetail != rcp->curtail) { |
| 210 | rcp->donetail = rcp->curtail; | 211 | rcp->donetail = rcp->curtail; |
| @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu) | |||
| 251 | */ | 252 | */ |
| 252 | void rcu_check_callbacks(int cpu, int user) | 253 | void rcu_check_callbacks(int cpu, int user) |
| 253 | { | 254 | { |
| 255 | check_cpu_stalls(); | ||
| 254 | if (user || rcu_is_cpu_rrupt_from_idle()) | 256 | if (user || rcu_is_cpu_rrupt_from_idle()) |
| 255 | rcu_sched_qs(cpu); | 257 | rcu_sched_qs(cpu); |
| 256 | else if (!in_softirq()) | 258 | else if (!in_softirq()) |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309b..8a233002faeb 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -33,6 +33,9 @@ struct rcu_ctrlblk { | |||
| 33 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 33 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ |
| 34 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 34 | struct rcu_head **curtail; /* ->next pointer of last CB. */ |
| 35 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | 35 | RCU_TRACE(long qlen); /* Number of pending CBs. */ |
| 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | ||
| 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | ||
| 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | ||
| 36 | RCU_TRACE(char *name); /* Name of RCU type. */ | 39 | RCU_TRACE(char *name); /* Name of RCU type. */ |
| 37 | }; | 40 | }; |
| 38 | 41 | ||
| @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly; | |||
| 54 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 57 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
| 55 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 58 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
| 56 | 59 | ||
| 60 | #ifdef CONFIG_RCU_TRACE | ||
| 61 | |||
| 62 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) | ||
| 63 | { | ||
| 64 | unsigned long j; | ||
| 65 | unsigned long js; | ||
| 66 | |||
| 67 | if (rcu_cpu_stall_suppress) | ||
| 68 | return; | ||
| 69 | rcp->ticks_this_gp++; | ||
| 70 | j = jiffies; | ||
| 71 | js = rcp->jiffies_stall; | ||
| 72 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | ||
| 73 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | ||
| 74 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | ||
| 75 | jiffies - rcp->gp_start, rcp->qlen); | ||
| 76 | dump_stack(); | ||
| 77 | } | ||
| 78 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
| 79 | rcp->jiffies_stall = jiffies + | ||
| 80 | 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 81 | else if (ULONG_CMP_GE(j, js)) | ||
| 82 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
| 83 | } | ||
| 84 | |||
| 85 | static void check_cpu_stall_preempt(void); | ||
| 86 | |||
| 87 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 88 | |||
| 89 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | ||
| 90 | { | ||
| 91 | #ifdef CONFIG_RCU_TRACE | ||
| 92 | rcp->ticks_this_gp = 0; | ||
| 93 | rcp->gp_start = jiffies; | ||
| 94 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
| 95 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 96 | } | ||
| 97 | |||
| 98 | static void check_cpu_stalls(void) | ||
| 99 | { | ||
| 100 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | ||
| 101 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | ||
| 102 | RCU_TRACE(check_cpu_stall_preempt()); | ||
| 103 | } | ||
| 104 | |||
| 57 | #ifdef CONFIG_TINY_PREEMPT_RCU | 105 | #ifdef CONFIG_TINY_PREEMPT_RCU |
| 58 | 106 | ||
| 59 | #include <linux/delay.h> | 107 | #include <linux/delay.h> |
| @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void) | |||
| 448 | /* Official start of GP. */ | 496 | /* Official start of GP. */ |
| 449 | rcu_preempt_ctrlblk.gpnum++; | 497 | rcu_preempt_ctrlblk.gpnum++; |
| 450 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | 498 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); |
| 499 | reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); | ||
| 451 | 500 | ||
| 452 | /* Any blocked RCU readers block new GP. */ | 501 | /* Any blocked RCU readers block new GP. */ |
| 453 | if (rcu_preempt_blocked_readers_any()) | 502 | if (rcu_preempt_blocked_readers_any()) |
| @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney"); | |||
| 1054 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | 1103 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); |
| 1055 | MODULE_LICENSE("GPL"); | 1104 | MODULE_LICENSE("GPL"); |
| 1056 | 1105 | ||
| 1106 | static void check_cpu_stall_preempt(void) | ||
| 1107 | { | ||
| 1108 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
| 1109 | check_cpu_stall(&rcu_preempt_ctrlblk.rcb); | ||
| 1110 | #endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
| 1111 | } | ||
| 1112 | |||
| 1057 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 1113 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
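check_cpu_stall() compares jiffies against rcp->jiffies_stall with ULONG_CMP_GE() rather than a plain >=, so the stall test keeps working across a jiffies wraparound. The small program below demonstrates that property; the macro definitions mirror the ones in <linux/rcupdate.h> and the sample values are arbitrary.

/* Wraparound-safe time comparison, as used by check_cpu_stall() above. */
#include <stdio.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)  (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)  (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
    unsigned long stall = ULONG_MAX - 5;    /* deadline set just before wrap */
    unsigned long now   = 10;               /* jiffies has since wrapped */

    /* A plain 'now >= stall' says "not yet"; the macro gets it right. */
    printf("plain  >=   : %d\n", now >= stall);
    printf("ULONG_CMP_GE: %d\n", ULONG_CMP_GE(now, stall));
    printf("ULONG_CMP_LT: %d\n", ULONG_CMP_LT(now, stall));
    return 0;
}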
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85fd..e1f3a8c96724 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <linux/stat.h> | 46 | #include <linux/stat.h> |
| 47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
| 48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
| 49 | #include <linux/trace_clock.h> | ||
| 49 | #include <asm/byteorder.h> | 50 | #include <asm/byteorder.h> |
| 50 | 51 | ||
| 51 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
| @@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | |||
| 207 | #define rcu_can_boost() 0 | 208 | #define rcu_can_boost() 0 |
| 208 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 209 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
| 209 | 210 | ||
| 211 | #ifdef CONFIG_RCU_TRACE | ||
| 212 | static u64 notrace rcu_trace_clock_local(void) | ||
| 213 | { | ||
| 214 | u64 ts = trace_clock_local(); | ||
| 215 | unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); | ||
| 216 | return ts; | ||
| 217 | } | ||
| 218 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 219 | static u64 notrace rcu_trace_clock_local(void) | ||
| 220 | { | ||
| 221 | return 0ULL; | ||
| 222 | } | ||
| 223 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 224 | |||
| 210 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | 225 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ |
| 211 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 226 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
| 212 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 227 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
| @@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg) | |||
| 845 | /* Wait for the next test interval. */ | 860 | /* Wait for the next test interval. */ |
| 846 | oldstarttime = boost_starttime; | 861 | oldstarttime = boost_starttime; |
| 847 | while (ULONG_CMP_LT(jiffies, oldstarttime)) { | 862 | while (ULONG_CMP_LT(jiffies, oldstarttime)) { |
| 848 | schedule_timeout_uninterruptible(1); | 863 | schedule_timeout_interruptible(oldstarttime - jiffies); |
| 849 | rcu_stutter_wait("rcu_torture_boost"); | 864 | rcu_stutter_wait("rcu_torture_boost"); |
| 850 | if (kthread_should_stop() || | 865 | if (kthread_should_stop() || |
| 851 | fullstop != FULLSTOP_DONTSTOP) | 866 | fullstop != FULLSTOP_DONTSTOP) |
| @@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void) | |||
| 1028 | return; | 1043 | return; |
| 1029 | if (atomic_xchg(&beenhere, 1) != 0) | 1044 | if (atomic_xchg(&beenhere, 1) != 0) |
| 1030 | return; | 1045 | return; |
| 1031 | do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); | ||
| 1032 | ftrace_dump(DUMP_ALL); | 1046 | ftrace_dump(DUMP_ALL); |
| 1033 | } | 1047 | } |
| 1034 | 1048 | ||
| @@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1042 | { | 1056 | { |
| 1043 | int idx; | 1057 | int idx; |
| 1044 | int completed; | 1058 | int completed; |
| 1059 | int completed_end; | ||
| 1045 | static DEFINE_RCU_RANDOM(rand); | 1060 | static DEFINE_RCU_RANDOM(rand); |
| 1046 | static DEFINE_SPINLOCK(rand_lock); | 1061 | static DEFINE_SPINLOCK(rand_lock); |
| 1047 | struct rcu_torture *p; | 1062 | struct rcu_torture *p; |
| 1048 | int pipe_count; | 1063 | int pipe_count; |
| 1064 | unsigned long long ts; | ||
| 1049 | 1065 | ||
| 1050 | idx = cur_ops->readlock(); | 1066 | idx = cur_ops->readlock(); |
| 1051 | completed = cur_ops->completed(); | 1067 | completed = cur_ops->completed(); |
| 1068 | ts = rcu_trace_clock_local(); | ||
| 1052 | p = rcu_dereference_check(rcu_torture_current, | 1069 | p = rcu_dereference_check(rcu_torture_current, |
| 1053 | rcu_read_lock_bh_held() || | 1070 | rcu_read_lock_bh_held() || |
| 1054 | rcu_read_lock_sched_held() || | 1071 | rcu_read_lock_sched_held() || |
| @@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1058 | cur_ops->readunlock(idx); | 1075 | cur_ops->readunlock(idx); |
| 1059 | return; | 1076 | return; |
| 1060 | } | 1077 | } |
| 1061 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
| 1062 | if (p->rtort_mbtest == 0) | 1078 | if (p->rtort_mbtest == 0) |
| 1063 | atomic_inc(&n_rcu_torture_mberror); | 1079 | atomic_inc(&n_rcu_torture_mberror); |
| 1064 | spin_lock(&rand_lock); | 1080 | spin_lock(&rand_lock); |
| @@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1071 | /* Should not happen, but... */ | 1087 | /* Should not happen, but... */ |
| 1072 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1088 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1073 | } | 1089 | } |
| 1074 | if (pipe_count > 1) | 1090 | completed_end = cur_ops->completed(); |
| 1091 | if (pipe_count > 1) { | ||
| 1092 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, | ||
| 1093 | completed, completed_end); | ||
| 1075 | rcutorture_trace_dump(); | 1094 | rcutorture_trace_dump(); |
| 1095 | } | ||
| 1076 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1096 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1077 | completed = cur_ops->completed() - completed; | 1097 | completed = completed_end - completed; |
| 1078 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1098 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1079 | /* Should not happen, but... */ | 1099 | /* Should not happen, but... */ |
| 1080 | completed = RCU_TORTURE_PIPE_LEN; | 1100 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1094,11 +1114,13 @@ static int | |||
| 1094 | rcu_torture_reader(void *arg) | 1114 | rcu_torture_reader(void *arg) |
| 1095 | { | 1115 | { |
| 1096 | int completed; | 1116 | int completed; |
| 1117 | int completed_end; | ||
| 1097 | int idx; | 1118 | int idx; |
| 1098 | DEFINE_RCU_RANDOM(rand); | 1119 | DEFINE_RCU_RANDOM(rand); |
| 1099 | struct rcu_torture *p; | 1120 | struct rcu_torture *p; |
| 1100 | int pipe_count; | 1121 | int pipe_count; |
| 1101 | struct timer_list t; | 1122 | struct timer_list t; |
| 1123 | unsigned long long ts; | ||
| 1102 | 1124 | ||
| 1103 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 1125 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
| 1104 | set_user_nice(current, 19); | 1126 | set_user_nice(current, 19); |
| @@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg) | |||
| 1112 | } | 1134 | } |
| 1113 | idx = cur_ops->readlock(); | 1135 | idx = cur_ops->readlock(); |
| 1114 | completed = cur_ops->completed(); | 1136 | completed = cur_ops->completed(); |
| 1137 | ts = rcu_trace_clock_local(); | ||
| 1115 | p = rcu_dereference_check(rcu_torture_current, | 1138 | p = rcu_dereference_check(rcu_torture_current, |
| 1116 | rcu_read_lock_bh_held() || | 1139 | rcu_read_lock_bh_held() || |
| 1117 | rcu_read_lock_sched_held() || | 1140 | rcu_read_lock_sched_held() || |
| @@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg) | |||
| 1122 | schedule_timeout_interruptible(HZ); | 1145 | schedule_timeout_interruptible(HZ); |
| 1123 | continue; | 1146 | continue; |
| 1124 | } | 1147 | } |
| 1125 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
| 1126 | if (p->rtort_mbtest == 0) | 1148 | if (p->rtort_mbtest == 0) |
| 1127 | atomic_inc(&n_rcu_torture_mberror); | 1149 | atomic_inc(&n_rcu_torture_mberror); |
| 1128 | cur_ops->read_delay(&rand); | 1150 | cur_ops->read_delay(&rand); |
| @@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg) | |||
| 1132 | /* Should not happen, but... */ | 1154 | /* Should not happen, but... */ |
| 1133 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1155 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1134 | } | 1156 | } |
| 1135 | if (pipe_count > 1) | 1157 | completed_end = cur_ops->completed(); |
| 1158 | if (pipe_count > 1) { | ||
| 1159 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, | ||
| 1160 | ts, completed, completed_end); | ||
| 1136 | rcutorture_trace_dump(); | 1161 | rcutorture_trace_dump(); |
| 1162 | } | ||
| 1137 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1163 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1138 | completed = cur_ops->completed() - completed; | 1164 | completed = completed_end - completed; |
| 1139 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1165 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1140 | /* Should not happen, but... */ | 1166 | /* Should not happen, but... */ |
| 1141 | completed = RCU_TORTURE_PIPE_LEN; | 1167 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void) | |||
| 1301 | set_cpus_allowed_ptr(reader_tasks[i], | 1327 | set_cpus_allowed_ptr(reader_tasks[i], |
| 1302 | shuffle_tmp_mask); | 1328 | shuffle_tmp_mask); |
| 1303 | } | 1329 | } |
| 1304 | |||
| 1305 | if (fakewriter_tasks) { | 1330 | if (fakewriter_tasks) { |
| 1306 | for (i = 0; i < nfakewriters; i++) | 1331 | for (i = 0; i < nfakewriters; i++) |
| 1307 | if (fakewriter_tasks[i]) | 1332 | if (fakewriter_tasks[i]) |
| 1308 | set_cpus_allowed_ptr(fakewriter_tasks[i], | 1333 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
| 1309 | shuffle_tmp_mask); | 1334 | shuffle_tmp_mask); |
| 1310 | } | 1335 | } |
| 1311 | |||
| 1312 | if (writer_task) | 1336 | if (writer_task) |
| 1313 | set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); | 1337 | set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); |
| 1314 | |||
| 1315 | if (stats_task) | 1338 | if (stats_task) |
| 1316 | set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); | 1339 | set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); |
| 1340 | if (stutter_task) | ||
| 1341 | set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); | ||
| 1342 | if (fqs_task) | ||
| 1343 | set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); | ||
| 1344 | if (shutdown_task) | ||
| 1345 | set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); | ||
| 1346 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1347 | if (onoff_task) | ||
| 1348 | set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); | ||
| 1349 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1350 | if (stall_task) | ||
| 1351 | set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); | ||
| 1352 | if (barrier_cbs_tasks) | ||
| 1353 | for (i = 0; i < n_barrier_cbs; i++) | ||
| 1354 | if (barrier_cbs_tasks[i]) | ||
| 1355 | set_cpus_allowed_ptr(barrier_cbs_tasks[i], | ||
| 1356 | shuffle_tmp_mask); | ||
| 1357 | if (barrier_task) | ||
| 1358 | set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); | ||
| 1317 | 1359 | ||
| 1318 | if (rcu_idle_cpu == -1) | 1360 | if (rcu_idle_cpu == -1) |
| 1319 | rcu_idle_cpu = num_online_cpus() - 1; | 1361 | rcu_idle_cpu = num_online_cpus() - 1; |
| @@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void) | |||
| 1749 | barrier_cbs_wq = | 1791 | barrier_cbs_wq = |
| 1750 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | 1792 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), |
| 1751 | GFP_KERNEL); | 1793 | GFP_KERNEL); |
| 1752 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) | 1794 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) |
| 1753 | return -ENOMEM; | 1795 | return -ENOMEM; |
| 1754 | for (i = 0; i < n_barrier_cbs; i++) { | 1796 | for (i = 0; i < n_barrier_cbs; i++) { |
| 1755 | init_waitqueue_head(&barrier_cbs_wq[i]); | 1797 | init_waitqueue_head(&barrier_cbs_wq[i]); |
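rcu_trace_clock_local() turns the nanosecond trace clock into microseconds with do_div(), which divides a u64 in place and hands back the remainder (the form 32-bit architectures can use, since the kernel cannot rely on native 64-bit division there). The userspace stand-in below shows the same conversion; it is not the kernel macro itself.

/* Userspace stand-in for the do_div(ts, NSEC_PER_USEC) conversion above:
 * divide in place, keep the remainder. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
    uint64_t ts = 1234567891234ULL;         /* ns from the trace clock */
    uint64_t rem = ts % NSEC_PER_USEC;      /* do_div() returns this */

    ts /= NSEC_PER_USEC;                    /* and leaves the quotient in ts */
    printf("ts = %llu us (remainder %llu ns)\n",
           (unsigned long long)ts, (unsigned long long)rem);
    return 0;
}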
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..5b8ad827fd86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | |||
| 105 | * The rcu_scheduler_active variable transitions from zero to one just | 105 | * The rcu_scheduler_active variable transitions from zero to one just |
| 106 | * before the first task is spawned. So when this variable is zero, RCU | 106 | * before the first task is spawned. So when this variable is zero, RCU |
| 107 | * can assume that there is but one task, allowing RCU to (for example) | 107 | * can assume that there is but one task, allowing RCU to (for example) |
| 108 | * optimized synchronize_sched() to a simple barrier(). When this variable | 108 | * optimize synchronize_sched() to a simple barrier(). When this variable |
| 109 | * is one, RCU must actually do all the hard work required to detect real | 109 | * is one, RCU must actually do all the hard work required to detect real |
| 110 | * grace periods. This variable is also used to suppress boot-time false | 110 | * grace periods. This variable is also used to suppress boot-time false |
| 111 | * positives from lockdep-RCU error checking. | 111 | * positives from lockdep-RCU error checking. |
| @@ -217,12 +217,6 @@ module_param(blimit, long, 0444); | |||
| 217 | module_param(qhimark, long, 0444); | 217 | module_param(qhimark, long, 0444); |
| 218 | module_param(qlowmark, long, 0444); | 218 | module_param(qlowmark, long, 0444); |
| 219 | 219 | ||
| 220 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | ||
| 221 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
| 222 | |||
| 223 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
| 224 | module_param(rcu_cpu_stall_timeout, int, 0644); | ||
| 225 | |||
| 226 | static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; | 220 | static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; |
| 227 | static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | 221 | static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; |
| 228 | 222 | ||
| @@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
| 305 | } | 299 | } |
| 306 | 300 | ||
| 307 | /* | 301 | /* |
| 308 | * Does the current CPU require a yet-as-unscheduled grace period? | 302 | * Does the current CPU require a not-yet-started grace period? |
| 303 | * The caller must have disabled interrupts to prevent races with | ||
| 304 | * normal callback registry. | ||
| 309 | */ | 305 | */ |
| 310 | static int | 306 | static int |
| 311 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 307 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
| 312 | { | 308 | { |
| 313 | struct rcu_head **ntp; | 309 | int i; |
| 314 | 310 | ||
| 315 | ntp = rdp->nxttail[RCU_DONE_TAIL + | 311 | if (rcu_gp_in_progress(rsp)) |
| 316 | (ACCESS_ONCE(rsp->completed) != rdp->completed)]; | 312 | return 0; /* No, a grace period is already in progress. */ |
| 317 | return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && | 313 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
| 318 | !rcu_gp_in_progress(rsp); | 314 | return 0; /* No, this is a no-CBs (or offline) CPU. */ |
| 315 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | ||
| 316 | return 1; /* Yes, this CPU has newly registered callbacks. */ | ||
| 317 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | ||
| 318 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | ||
| 319 | ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), | ||
| 320 | rdp->nxtcompleted[i])) | ||
| 321 | return 1; /* Yes, CBs for future grace period. */ | ||
| 322 | return 0; /* No grace period needed. */ | ||
| 319 | } | 323 | } |
| 320 | 324 | ||
| 321 | /* | 325 | /* |
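The rewritten cpu_needs_another_gp() above decides whether to request a new grace period by checking, in order: a grace period already in progress, a no-CBs or offline CPU, newly registered callbacks, and callbacks already tagged with a future ->completed value. That last test relies on a wraparound-safe comparison of free-running counters. A standalone restatement of the comparison; the helper name and the sample values are mine, but the kernel's ULONG_CMP_LT() is defined along the same lines:

    #include <limits.h>
    #include <stdio.h>

    /*
     * Wraparound-safe "a comes before b" for free-running counters such as
     * ->completed: the unsigned difference tells us which side of the wrap
     * we are on, as long as the two values stay within half the range.
     */
    static int toy_cmp_lt(unsigned long a, unsigned long b)
    {
        return ULONG_MAX / 2 < a - b;
    }

    int main(void)
    {
        unsigned long before_wrap = ULONG_MAX - 1;  /* counter about to wrap */
        unsigned long after_wrap = 2;               /* same counter, a few steps later */

        printf("%d\n", toy_cmp_lt(before_wrap, after_wrap)); /* 1: before < after */
        printf("%d\n", toy_cmp_lt(after_wrap, before_wrap)); /* 0 */
        return 0;
    }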
| @@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
| 336 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 340 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
| 337 | bool user) | 341 | bool user) |
| 338 | { | 342 | { |
| 339 | trace_rcu_dyntick("Start", oldval, 0); | 343 | trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); |
| 340 | if (!user && !is_idle_task(current)) { | 344 | if (!user && !is_idle_task(current)) { |
| 341 | struct task_struct *idle = idle_task(smp_processor_id()); | 345 | struct task_struct *idle = idle_task(smp_processor_id()); |
| 342 | 346 | ||
| @@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | |||
| 727 | * interrupt from idle, return true. The caller must have at least | 731 | * interrupt from idle, return true. The caller must have at least |
| 728 | * disabled preemption. | 732 | * disabled preemption. |
| 729 | */ | 733 | */ |
| 730 | int rcu_is_cpu_rrupt_from_idle(void) | 734 | static int rcu_is_cpu_rrupt_from_idle(void) |
| 731 | { | 735 | { |
| 732 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; | 736 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; |
| 733 | } | 737 | } |
| @@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 793 | return 0; | 797 | return 0; |
| 794 | } | 798 | } |
| 795 | 799 | ||
| 796 | static int jiffies_till_stall_check(void) | ||
| 797 | { | ||
| 798 | int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); | ||
| 799 | |||
| 800 | /* | ||
| 801 | * Limit check must be consistent with the Kconfig limits | ||
| 802 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
| 803 | */ | ||
| 804 | if (till_stall_check < 3) { | ||
| 805 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; | ||
| 806 | till_stall_check = 3; | ||
| 807 | } else if (till_stall_check > 300) { | ||
| 808 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; | ||
| 809 | till_stall_check = 300; | ||
| 810 | } | ||
| 811 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
| 812 | } | ||
| 813 | |||
| 814 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 800 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
| 815 | { | 801 | { |
| 816 | rsp->gp_start = jiffies; | 802 | rsp->gp_start = jiffies; |
| 817 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); | 803 | rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); |
| 818 | } | 804 | } |
| 819 | 805 | ||
| 820 | /* | 806 | /* |
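The deleted jiffies_till_stall_check() clamped the stall-timeout sysctl to the 3..300 second range, converted it to jiffies and added a debugging fudge factor; the call sites now go through rcu_jiffies_till_stall_check(), presumably a shared helper that is not shown in this hunk. A sketch of the clamp-and-convert arithmetic under assumed HZ and delta values; the write-back of the clamped value to the module parameter is omitted:

    #include <stdio.h>

    #define TOY_HZ          1000  /* assumption: 1000 ticks per second */
    #define TOY_STALL_DELTA 0     /* extra slack; non-zero under lockdep testing */

    /* Clamp a seconds value to the 3..300 range and convert it to jiffies,
     * as the removed helper did. */
    static long stall_check_jiffies(long timeout_sec)
    {
        if (timeout_sec < 3)
            timeout_sec = 3;
        else if (timeout_sec > 300)
            timeout_sec = 300;
        return timeout_sec * TOY_HZ + TOY_STALL_DELTA;
    }

    int main(void)
    {
        printf("%ld\n", stall_check_jiffies(0));    /* 3000: clamped up */
        printf("%ld\n", stall_check_jiffies(21));   /* 21000 */
        printf("%ld\n", stall_check_jiffies(1000)); /* 300000: clamped down */
        return 0;
    }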
| @@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 857 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 843 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 858 | return; | 844 | return; |
| 859 | } | 845 | } |
| 860 | rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; | 846 | rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; |
| 861 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 847 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 862 | 848 | ||
| 863 | /* | 849 | /* |
| @@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 935 | raw_spin_lock_irqsave(&rnp->lock, flags); | 921 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 936 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 922 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
| 937 | rsp->jiffies_stall = jiffies + | 923 | rsp->jiffies_stall = jiffies + |
| 938 | 3 * jiffies_till_stall_check() + 3; | 924 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 939 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 925 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 940 | 926 | ||
| 941 | set_need_resched(); /* kick ourselves to get things going. */ | 927 | set_need_resched(); /* kick ourselves to get things going. */ |
| @@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 966 | } | 952 | } |
| 967 | } | 953 | } |
| 968 | 954 | ||
| 969 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | ||
| 970 | { | ||
| 971 | rcu_cpu_stall_suppress = 1; | ||
| 972 | return NOTIFY_DONE; | ||
| 973 | } | ||
| 974 | |||
| 975 | /** | 955 | /** |
| 976 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | 956 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period |
| 977 | * | 957 | * |
| @@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void) | |||
| 989 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | 969 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; |
| 990 | } | 970 | } |
| 991 | 971 | ||
| 992 | static struct notifier_block rcu_panic_block = { | ||
| 993 | .notifier_call = rcu_panic, | ||
| 994 | }; | ||
| 995 | |||
| 996 | static void __init check_cpu_stall_init(void) | ||
| 997 | { | ||
| 998 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | ||
| 999 | } | ||
| 1000 | |||
| 1001 | /* | 972 | /* |
| 1002 | * Update CPU-local rcu_data state to record the newly noticed grace period. | 973 | * Update CPU-local rcu_data state to record the newly noticed grace period. |
| 1003 | * This is used both when we started the grace period and when we notice | 974 | * This is used both when we started the grace period and when we notice |
| @@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp) | |||
| 1071 | } | 1042 | } |
| 1072 | 1043 | ||
| 1073 | /* | 1044 | /* |
| 1045 | * Determine the value that ->completed will have at the end of the | ||
| 1046 | * next subsequent grace period. This is used to tag callbacks so that | ||
| 1047 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | ||
| 1048 | * been dyntick-idle for an extended period with callbacks under the | ||
| 1049 | * influence of RCU_FAST_NO_HZ. | ||
| 1050 | * | ||
| 1051 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1052 | */ | ||
| 1053 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | ||
| 1054 | struct rcu_node *rnp) | ||
| 1055 | { | ||
| 1056 | /* | ||
| 1057 | * If RCU is idle, we just wait for the next grace period. | ||
| 1058 | * But we can only be sure that RCU is idle if we are looking | ||
| 1059 | * at the root rcu_node structure -- otherwise, a new grace | ||
| 1060 | * period might have started, but just not yet gotten around | ||
| 1061 | * to initializing the current non-root rcu_node structure. | ||
| 1062 | */ | ||
| 1063 | if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) | ||
| 1064 | return rnp->completed + 1; | ||
| 1065 | |||
| 1066 | /* | ||
| 1067 | * Otherwise, wait for a possible partial grace period and | ||
| 1068 | * then the subsequent full grace period. | ||
| 1069 | */ | ||
| 1070 | return rnp->completed + 2; | ||
| 1071 | } | ||
| 1072 | |||
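rcu_cbs_completed() above predicts the ->completed value that will cover callbacks queued right now: the next grace period if the root node shows RCU idle, otherwise two grace periods out to allow for one already in flight. A standalone restatement with a simplified node; toy_node and its fields are inventions for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_node {
        unsigned long gpnum;     /* most recently started grace period */
        unsigned long completed; /* most recently completed grace period */
        bool is_root;            /* root of the toy hierarchy? */
    };

    /* Which ->completed value will cover callbacks queued right now? */
    static unsigned long cbs_completed(const struct toy_node *np)
    {
        /* Idle can only be trusted at the root; then the next GP suffices. */
        if (np->is_root && np->gpnum == np->completed)
            return np->completed + 1;
        /* Otherwise allow for a partial GP plus a full one. */
        return np->completed + 2;
    }

    int main(void)
    {
        struct toy_node idle_root = { .gpnum = 10, .completed = 10, .is_root = true };
        struct toy_node busy_leaf = { .gpnum = 11, .completed = 10, .is_root = false };

        printf("%lu %lu\n", cbs_completed(&idle_root), cbs_completed(&busy_leaf)); /* 11 12 */
        return 0;
    }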
| 1073 | /* | ||
| 1074 | * If there is room, assign a ->completed number to any callbacks on | ||
| 1075 | * this CPU that have not already been assigned. Also accelerate any | ||
| 1076 | * callbacks that were previously assigned a ->completed number that has | ||
| 1077 | * since proven to be too conservative, which can happen if callbacks get | ||
| 1078 | * assigned a ->completed number while RCU is idle, but with reference to | ||
| 1079 | * a non-root rcu_node structure. This function is idempotent, so it does | ||
| 1080 | * not hurt to call it repeatedly. | ||
| 1081 | * | ||
| 1082 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1083 | */ | ||
| 1084 | static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1085 | struct rcu_data *rdp) | ||
| 1086 | { | ||
| 1087 | unsigned long c; | ||
| 1088 | int i; | ||
| 1089 | |||
| 1090 | /* If the CPU has no callbacks, nothing to do. */ | ||
| 1091 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1092 | return; | ||
| 1093 | |||
| 1094 | /* | ||
| 1095 | * Starting from the sublist containing the callbacks most | ||
| 1096 | * recently assigned a ->completed number and working down, find the | ||
| 1097 | * first sublist that is not assignable to an upcoming grace period. | ||
| 1098 | * Such a sublist has something in it (first two tests) and has | ||
| 1099 | * a ->completed number assigned that will complete sooner than | ||
| 1100 | * the ->completed number for newly arrived callbacks (last test). | ||
| 1101 | * | ||
| 1102 | * The key point is that any later sublist can be assigned the | ||
| 1103 | * same ->completed number as the newly arrived callbacks, which | ||
| 1104 | * means that the callbacks in any of these later sublists can be | ||
| 1105 | * grouped into a single sublist, whether or not they have already | ||
| 1106 | * been assigned a ->completed number. | ||
| 1107 | */ | ||
| 1108 | c = rcu_cbs_completed(rsp, rnp); | ||
| 1109 | for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) | ||
| 1110 | if (rdp->nxttail[i] != rdp->nxttail[i - 1] && | ||
| 1111 | !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) | ||
| 1112 | break; | ||
| 1113 | |||
| 1114 | /* | ||
| 1115 | * If there is no sublist for unassigned callbacks, leave. | ||
| 1116 | * At the same time, advance "i" one sublist, so that "i" will | ||
| 1117 | * index the sublist into which all the remaining callbacks should | ||
| 1118 | * be grouped. | ||
| 1119 | */ | ||
| 1120 | if (++i >= RCU_NEXT_TAIL) | ||
| 1121 | return; | ||
| 1122 | |||
| 1123 | /* | ||
| 1124 | * Assign all subsequent callbacks' ->completed number to the next | ||
| 1125 | * full grace period and group them all in the sublist initially | ||
| 1126 | * indexed by "i". | ||
| 1127 | */ | ||
| 1128 | for (; i <= RCU_NEXT_TAIL; i++) { | ||
| 1129 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1130 | rdp->nxtcompleted[i] = c; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | /* Trace depending on how much we were able to accelerate. */ | ||
| 1134 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | ||
| 1135 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); | ||
| 1136 | else | ||
| 1137 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | /* | ||
| 1141 | * Move any callbacks whose grace period has completed to the | ||
| 1142 | * RCU_DONE_TAIL sublist, then compact the remaining sublists and | ||
| 1143 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL | ||
| 1144 | * sublist. This function is idempotent, so it does not hurt to | ||
| 1145 | * invoke it repeatedly. As long as it is not invoked -too- often... | ||
| 1146 | * | ||
| 1147 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1148 | */ | ||
| 1149 | static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1150 | struct rcu_data *rdp) | ||
| 1151 | { | ||
| 1152 | int i, j; | ||
| 1153 | |||
| 1154 | /* If the CPU has no callbacks, nothing to do. */ | ||
| 1155 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1156 | return; | ||
| 1157 | |||
| 1158 | /* | ||
| 1159 | * Find all callbacks whose ->completed numbers indicate that they | ||
| 1160 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. | ||
| 1161 | */ | ||
| 1162 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | ||
| 1163 | if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) | ||
| 1164 | break; | ||
| 1165 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; | ||
| 1166 | } | ||
| 1167 | /* Clean up any sublist tail pointers that were misordered above. */ | ||
| 1168 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
| 1169 | rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1170 | |||
| 1171 | /* Copy down callbacks to fill in empty sublists. */ | ||
| 1172 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
| 1173 | if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) | ||
| 1174 | break; | ||
| 1175 | rdp->nxttail[j] = rdp->nxttail[i]; | ||
| 1176 | rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | /* Classify any remaining callbacks. */ | ||
| 1180 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
| 1181 | } | ||
| 1182 | |||
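rcu_accelerate_cbs() and rcu_advance_cbs() above manage the segmented callback list: advance drains every segment whose tagged grace period has completed into the done segment and compacts what is left, while accelerate tags the still-untagged arrivals with the predicted completion number, possibly re-tagging an over-conservative earlier assignment. The sketch below models the four segments as plain counters plus a tag rather than the kernel's tail-pointer array; all names are hypothetical and the compaction is simplified to a single slot:

    #include <stdio.h>

    /* Segment indices, mirroring the four-part callback list. */
    enum { DONE, WAIT, NEXT_READY, NEXT, NSEG };

    struct toy_cbs {
        unsigned long count[NSEG]; /* callbacks per segment */
        unsigned long tag[NSEG];   /* grace period each tagged segment waits for */
    };

    /* Tag the untagged NEXT segment with grace period c and fold it into
     * NEXT_READY when that segment is empty or waits for c or later. */
    static void accelerate(struct toy_cbs *cbs, unsigned long c)
    {
        if (!cbs->count[NEXT])
            return;
        if (!cbs->count[NEXT_READY] || cbs->tag[NEXT_READY] >= c) {
            cbs->count[NEXT_READY] += cbs->count[NEXT];
            cbs->tag[NEXT_READY] = c;
            cbs->count[NEXT] = 0;
        }
    }

    /* Drain every segment whose tag has completed into DONE, slide the
     * remainder down, then classify whatever is still untagged. */
    static void advance(struct toy_cbs *cbs, unsigned long completed, unsigned long c)
    {
        int i;

        for (i = WAIT; i <= NEXT_READY; i++) {
            if (cbs->count[i] && cbs->tag[i] > completed)
                break;
            cbs->count[DONE] += cbs->count[i];
            cbs->count[i] = 0;
        }
        if (i == NEXT_READY && cbs->count[NEXT_READY]) { /* compact */
            cbs->count[WAIT] = cbs->count[NEXT_READY];
            cbs->tag[WAIT] = cbs->tag[NEXT_READY];
            cbs->count[NEXT_READY] = 0;
        }
        accelerate(cbs, c);
    }

    int main(void)
    {
        struct toy_cbs cbs = { .count = { 0, 2, 3, 4 }, .tag = { 0, 11, 12, 0 } };

        advance(&cbs, 11, 13); /* GP 11 is done; new arrivals get tag 13 */
        printf("done=%lu wait=%lu(tag %lu) ready=%lu(tag %lu) next=%lu\n",
               cbs.count[DONE], cbs.count[WAIT], cbs.tag[WAIT],
               cbs.count[NEXT_READY], cbs.tag[NEXT_READY], cbs.count[NEXT]);
        /* prints: done=2 wait=3(tag 12) ready=4(tag 13) next=0 */
        return 0;
    }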
| 1183 | /* | ||
| 1074 | * Advance this CPU's callbacks, but only if the current grace period | 1184 | * Advance this CPU's callbacks, but only if the current grace period |
| 1075 | * has ended. This may be called only from the CPU to whom the rdp | 1185 | * has ended. This may be called only from the CPU to whom the rdp |
| 1076 | * belongs. In addition, the corresponding leaf rcu_node structure's | 1186 | * belongs. In addition, the corresponding leaf rcu_node structure's |
| @@ -1080,12 +1190,15 @@ static void | |||
| 1080 | __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 1190 | __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
| 1081 | { | 1191 | { |
| 1082 | /* Did another grace period end? */ | 1192 | /* Did another grace period end? */ |
| 1083 | if (rdp->completed != rnp->completed) { | 1193 | if (rdp->completed == rnp->completed) { |
| 1084 | 1194 | ||
| 1085 | /* Advance callbacks. No harm if list empty. */ | 1195 | /* No, so just accelerate recent callbacks. */ |
| 1086 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; | 1196 | rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1087 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; | 1197 | |
| 1088 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | 1198 | } else { |
| 1199 | |||
| 1200 | /* Advance callbacks. */ | ||
| 1201 | rcu_advance_cbs(rsp, rnp, rdp); | ||
| 1089 | 1202 | ||
| 1090 | /* Remember that we saw this grace-period completion. */ | 1203 | /* Remember that we saw this grace-period completion. */ |
| 1091 | rdp->completed = rnp->completed; | 1204 | rdp->completed = rnp->completed; |
| @@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 1392 | /* | 1505 | /* |
| 1393 | * Because there is no grace period in progress right now, | 1506 | * Because there is no grace period in progress right now, |
| 1394 | * any callbacks we have up to this point will be satisfied | 1507 | * any callbacks we have up to this point will be satisfied |
| 1395 | * by the next grace period. So promote all callbacks to be | 1508 | * by the next grace period. So this is a good place to |
| 1396 | * handled after the end of the next grace period. If the | 1509 | * assign a grace period number to recently posted callbacks. |
| 1397 | * CPU is not yet aware of the end of the previous grace period, | ||
| 1398 | * we need to allow for the callback advancement that will | ||
| 1399 | * occur when it does become aware. Deadlock prevents us from | ||
| 1400 | * making it aware at this point: We cannot acquire a leaf | ||
| 1401 | * rcu_node ->lock while holding the root rcu_node ->lock. | ||
| 1402 | */ | 1510 | */ |
| 1403 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | 1511 | rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1404 | if (rdp->completed == rsp->completed) | ||
| 1405 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1406 | 1512 | ||
| 1407 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1513 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
| 1408 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | 1514 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ |
| @@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1527 | * This GP can't end until cpu checks in, so all of our | 1633 | * This GP can't end until cpu checks in, so all of our |
| 1528 | * callbacks can be processed during the next GP. | 1634 | * callbacks can be processed during the next GP. |
| 1529 | */ | 1635 | */ |
| 1530 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | 1636 | rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1531 | 1637 | ||
| 1532 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 1638 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ |
| 1533 | } | 1639 | } |
| @@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1779 | long bl, count, count_lazy; | 1885 | long bl, count, count_lazy; |
| 1780 | int i; | 1886 | int i; |
| 1781 | 1887 | ||
| 1782 | /* If no callbacks are ready, just return.*/ | 1888 | /* If no callbacks are ready, just return. */ |
| 1783 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1889 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
| 1784 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); | 1890 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); |
| 1785 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), | 1891 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), |
| @@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 2008 | 2114 | ||
| 2009 | WARN_ON_ONCE(rdp->beenonline == 0); | 2115 | WARN_ON_ONCE(rdp->beenonline == 0); |
| 2010 | 2116 | ||
| 2011 | /* | 2117 | /* Handle the end of a grace period that some other CPU ended. */ |
| 2012 | * Advance callbacks in response to end of earlier grace | ||
| 2013 | * period that some other CPU ended. | ||
| 2014 | */ | ||
| 2015 | rcu_process_gp_end(rsp, rdp); | 2118 | rcu_process_gp_end(rsp, rdp); |
| 2016 | 2119 | ||
| 2017 | /* Update RCU state based on any recent quiescent states. */ | 2120 | /* Update RCU state based on any recent quiescent states. */ |
| 2018 | rcu_check_quiescent_state(rsp, rdp); | 2121 | rcu_check_quiescent_state(rsp, rdp); |
| 2019 | 2122 | ||
| 2020 | /* Does this CPU require a not-yet-started grace period? */ | 2123 | /* Does this CPU require a not-yet-started grace period? */ |
| 2124 | local_irq_save(flags); | ||
| 2021 | if (cpu_needs_another_gp(rsp, rdp)) { | 2125 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 2022 | raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); | 2126 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ |
| 2023 | rcu_start_gp(rsp, flags); /* releases above lock */ | 2127 | rcu_start_gp(rsp, flags); /* releases above lock */ |
| 2128 | } else { | ||
| 2129 | local_irq_restore(flags); | ||
| 2024 | } | 2130 | } |
| 2025 | 2131 | ||
| 2026 | /* If there are callbacks ready, invoke them. */ | 2132 | /* If there are callbacks ready, invoke them. */ |
| @@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 2719 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2825 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 2720 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 2826 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 2721 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2827 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| 2722 | #ifdef CONFIG_RCU_USER_QS | ||
| 2723 | WARN_ON_ONCE(rdp->dynticks->in_user); | ||
| 2724 | #endif | ||
| 2725 | rdp->cpu = cpu; | 2828 | rdp->cpu = cpu; |
| 2726 | rdp->rsp = rsp; | 2829 | rdp->rsp = rsp; |
| 2727 | rcu_boot_init_nocb_percpu_data(rdp); | 2830 | rcu_boot_init_nocb_percpu_data(rdp); |
| @@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 2938 | 3041 | ||
| 2939 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | 3042 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ |
| 2940 | 3043 | ||
| 3044 | /* Silence gcc 4.8 warning about array index out of range. */ | ||
| 3045 | if (rcu_num_lvls > RCU_NUM_LVLS) | ||
| 3046 | panic("rcu_init_one: rcu_num_lvls overflow"); | ||
| 3047 | |||
| 2941 | /* Initialize the level-tracking arrays. */ | 3048 | /* Initialize the level-tracking arrays. */ |
| 2942 | 3049 | ||
| 2943 | for (i = 0; i < rcu_num_lvls; i++) | 3050 | for (i = 0; i < rcu_num_lvls; i++) |
| @@ -3074,7 +3181,6 @@ void __init rcu_init(void) | |||
| 3074 | cpu_notifier(rcu_cpu_notify, 0); | 3181 | cpu_notifier(rcu_cpu_notify, 0); |
| 3075 | for_each_online_cpu(cpu) | 3182 | for_each_online_cpu(cpu) |
| 3076 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3183 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 3077 | check_cpu_stall_init(); | ||
| 3078 | } | 3184 | } |
| 3079 | 3185 | ||
| 3080 | #include "rcutree_plugin.h" | 3186 | #include "rcutree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..c896b5045d9d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -102,10 +102,6 @@ struct rcu_dynticks { | |||
| 102 | /* idle-period nonlazy_posted snapshot. */ | 102 | /* idle-period nonlazy_posted snapshot. */ |
| 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
| 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 105 | #ifdef CONFIG_RCU_USER_QS | ||
| 106 | bool ignore_user_qs; /* Treat userspace as extended QS or not */ | ||
| 107 | bool in_user; /* Is the CPU in userland from RCU POV? */ | ||
| 108 | #endif | ||
| 109 | }; | 105 | }; |
| 110 | 106 | ||
| 111 | /* RCU's kthread states for tracing. */ | 107 | /* RCU's kthread states for tracing. */ |
| @@ -282,6 +278,8 @@ struct rcu_data { | |||
| 282 | */ | 278 | */ |
| 283 | struct rcu_head *nxtlist; | 279 | struct rcu_head *nxtlist; |
| 284 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | 280 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; |
| 281 | unsigned long nxtcompleted[RCU_NEXT_SIZE]; | ||
| 282 | /* grace periods for sublists. */ | ||
| 285 | long qlen_lazy; /* # of lazy queued callbacks */ | 283 | long qlen_lazy; /* # of lazy queued callbacks */ |
| 286 | long qlen; /* # of queued callbacks, incl lazy */ | 284 | long qlen; /* # of queued callbacks, incl lazy */ |
| 287 | long qlen_last_fqs_check; | 285 | long qlen_last_fqs_check; |
| @@ -343,11 +341,6 @@ struct rcu_data { | |||
| 343 | 341 | ||
| 344 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 342 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
| 345 | 343 | ||
| 346 | #ifdef CONFIG_PROVE_RCU | ||
| 347 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | ||
| 348 | #else | ||
| 349 | #define RCU_STALL_DELAY_DELTA 0 | ||
| 350 | #endif | ||
| 351 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 344 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
| 352 | /* to take at least one */ | 345 | /* to take at least one */ |
| 353 | /* scheduling clock irq */ | 346 | /* scheduling clock irq */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f6e5ec2932b4..c1cc7e17ff9d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -40,8 +40,7 @@ | |||
| 40 | #ifdef CONFIG_RCU_NOCB_CPU | 40 | #ifdef CONFIG_RCU_NOCB_CPU |
| 41 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | 41 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ |
| 42 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | 42 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ |
| 43 | static bool rcu_nocb_poll; /* Offload kthreads are to poll. */ | 43 | static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */ |
| 44 | module_param(rcu_nocb_poll, bool, 0444); | ||
| 45 | static char __initdata nocb_buf[NR_CPUS * 5]; | 44 | static char __initdata nocb_buf[NR_CPUS * 5]; |
| 46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 45 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 47 | 46 | ||
| @@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str) | |||
| 2159 | } | 2158 | } |
| 2160 | __setup("rcu_nocbs=", rcu_nocb_setup); | 2159 | __setup("rcu_nocbs=", rcu_nocb_setup); |
| 2161 | 2160 | ||
| 2161 | static int __init parse_rcu_nocb_poll(char *arg) | ||
| 2162 | { | ||
| 2163 | rcu_nocb_poll = 1; | ||
| 2164 | return 0; | ||
| 2165 | } | ||
| 2166 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | ||
| 2167 | |||
| 2162 | /* Is the specified CPU a no-CBs CPU? */ | 2168 | /* Is the specified CPU a no-CBs CPU? */ |
| 2163 | static bool is_nocb_cpu(int cpu) | 2169 | static bool is_nocb_cpu(int cpu) |
| 2164 | { | 2170 | { |
| @@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2366 | for (;;) { | 2372 | for (;;) { |
| 2367 | /* If not polling, wait for next batch of callbacks. */ | 2373 | /* If not polling, wait for next batch of callbacks. */ |
| 2368 | if (!rcu_nocb_poll) | 2374 | if (!rcu_nocb_poll) |
| 2369 | wait_event(rdp->nocb_wq, rdp->nocb_head); | 2375 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); |
| 2370 | list = ACCESS_ONCE(rdp->nocb_head); | 2376 | list = ACCESS_ONCE(rdp->nocb_head); |
| 2371 | if (!list) { | 2377 | if (!list) { |
| 2372 | schedule_timeout_interruptible(1); | 2378 | schedule_timeout_interruptible(1); |
| 2379 | flush_signals(current); | ||
| 2373 | continue; | 2380 | continue; |
| 2374 | } | 2381 | } |
| 2375 | 2382 | ||
diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..01ab081ac53a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
| 1139 | if (!desc->count) | 1139 | if (!desc->count) |
| 1140 | return 0; | 1140 | return 0; |
| 1141 | 1141 | ||
| 1142 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); | 1142 | mutex_lock(&file_inode(filp)->i_mutex); |
| 1143 | do { | 1143 | do { |
| 1144 | if (!relay_file_read_avail(buf, *ppos)) | 1144 | if (!relay_file_read_avail(buf, *ppos)) |
| 1145 | break; | 1145 | break; |
| @@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
| 1159 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1159 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 1160 | } | 1160 | } |
| 1161 | } while (desc->count && ret); | 1161 | } while (desc->count && ret); |
| 1162 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); | 1162 | mutex_unlock(&file_inode(filp)->i_mutex); |
| 1163 | 1163 | ||
| 1164 | return desc->written; | 1164 | return desc->written; |
| 1165 | } | 1165 | } |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c8..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | * See rt.c in preempt-rt for proper credits and further information | 17 | * See rt.c in preempt-rt for proper credits and further information |
| 18 | */ | 18 | */ |
| 19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
| 20 | #include <linux/sched/rt.h> | ||
| 20 | #include <linux/delay.h> | 21 | #include <linux/delay.h> |
| 21 | #include <linux/export.h> | 22 | #include <linux/export.h> |
| 22 | #include <linux/spinlock.h> | 23 | #include <linux/spinlock.h> |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec49475460..7890b10084a7 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #include <linux/kthread.h> | 10 | #include <linux/kthread.h> |
| 11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
| 12 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
| 13 | #include <linux/sched/rt.h> | ||
| 13 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
| 14 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
| 15 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c993..1e09308bf2a1 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
| 14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
| 15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
| 16 | #include <linux/sched/rt.h> | ||
| 16 | #include <linux/timer.h> | 17 | #include <linux/timer.h> |
| 17 | 18 | ||
| 18 | #include "rtmutex_common.h" | 19 | #include "rtmutex_common.h" |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d8..b3c6c3fcd847 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
| @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
| 116 | 116 | ||
| 117 | EXPORT_SYMBOL(down_read_nested); | 117 | EXPORT_SYMBOL(down_read_nested); |
| 118 | 118 | ||
| 119 | void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | ||
| 120 | { | ||
| 121 | might_sleep(); | ||
| 122 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | ||
| 123 | |||
| 124 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | ||
| 125 | } | ||
| 126 | |||
| 127 | EXPORT_SYMBOL(_down_write_nest_lock); | ||
| 128 | |||
| 119 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 129 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
| 120 | { | 130 | { |
| 121 | might_sleep(); | 131 | might_sleep(); |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
| @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) | |||
| 35 | ag->tg->rt_se = NULL; | 35 | ag->tg->rt_se = NULL; |
| 36 | ag->tg->rt_rq = NULL; | 36 | ag->tg->rt_rq = NULL; |
| 37 | #endif | 37 | #endif |
| 38 | sched_offline_group(ag->tg); | ||
| 38 | sched_destroy_group(ag->tg); | 39 | sched_destroy_group(ag->tg); |
| 39 | } | 40 | } |
| 40 | 41 | ||
| @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) | |||
| 76 | if (IS_ERR(tg)) | 77 | if (IS_ERR(tg)) |
| 77 | goto out_free; | 78 | goto out_free; |
| 78 | 79 | ||
| 80 | sched_online_group(tg, &root_task_group); | ||
| 81 | |||
| 79 | kref_init(&ag->kref); | 82 | kref_init(&ag->kref); |
| 80 | init_rwsem(&ag->lock); | 83 | init_rwsem(&ag->lock); |
| 81 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 84 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d6fdcdcbb9b1..7f12624a393c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -83,7 +83,7 @@ | |||
| 83 | #endif | 83 | #endif |
| 84 | 84 | ||
| 85 | #include "sched.h" | 85 | #include "sched.h" |
| 86 | #include "../workqueue_sched.h" | 86 | #include "../workqueue_internal.h" |
| 87 | #include "../smpboot.h" | 87 | #include "../smpboot.h" |
| 88 | 88 | ||
| 89 | #define CREATE_TRACE_POINTS | 89 | #define CREATE_TRACE_POINTS |
| @@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
| 1132 | */ | 1132 | */ |
| 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) | 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) |
| 1134 | { | 1134 | { |
| 1135 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 1135 | int nid = cpu_to_node(cpu); |
| 1136 | const struct cpumask *nodemask = NULL; | ||
| 1136 | enum { cpuset, possible, fail } state = cpuset; | 1137 | enum { cpuset, possible, fail } state = cpuset; |
| 1137 | int dest_cpu; | 1138 | int dest_cpu; |
| 1138 | 1139 | ||
| 1139 | /* Look for allowed, online CPU in same node. */ | 1140 | /* |
| 1140 | for_each_cpu(dest_cpu, nodemask) { | 1141 | * If the node that the cpu is on has been offlined, cpu_to_node() |
| 1141 | if (!cpu_online(dest_cpu)) | 1142 | * will return -1. There is no cpu on the node, and we should |
| 1142 | continue; | 1143 | * select the cpu on the other node. |
| 1143 | if (!cpu_active(dest_cpu)) | 1144 | */ |
| 1144 | continue; | 1145 | if (nid != -1) { |
| 1145 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 1146 | nodemask = cpumask_of_node(nid); |
| 1146 | return dest_cpu; | 1147 | |
| 1148 | /* Look for allowed, online CPU in same node. */ | ||
| 1149 | for_each_cpu(dest_cpu, nodemask) { | ||
| 1150 | if (!cpu_online(dest_cpu)) | ||
| 1151 | continue; | ||
| 1152 | if (!cpu_active(dest_cpu)) | ||
| 1153 | continue; | ||
| 1154 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | ||
| 1155 | return dest_cpu; | ||
| 1156 | } | ||
| 1147 | } | 1157 | } |
| 1148 | 1158 | ||
| 1149 | for (;;) { | 1159 | for (;;) { |
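The select_fallback_rq() change above guards against cpu_to_node() returning -1 once the CPU's node has been offlined: only a valid node id gets the node-local scan, otherwise the code falls straight through to the wider search. A small sketch of that guard with an invented topology table; cpu_node, cpu_online_map and fallback_cpu are illustrative names, not kernel interfaces:

    #include <stdio.h>

    #define NR_TOY_CPUS 4

    /* Invented topology: node id per CPU, -1 meaning the node went away. */
    static const int cpu_node[NR_TOY_CPUS]       = { 0, 0, -1, 1 };
    static const int cpu_online_map[NR_TOY_CPUS] = { 1, 1, 0, 1 };

    /* Prefer an online CPU on the same node; skip the node-local pass
     * entirely when the node id is invalid. */
    static int fallback_cpu(int cpu)
    {
        int nid = cpu_node[cpu];
        int i;

        if (nid != -1) {
            for (i = 0; i < NR_TOY_CPUS; i++)
                if (cpu_online_map[i] && cpu_node[i] == nid)
                    return i;
        }
        for (i = 0; i < NR_TOY_CPUS; i++) /* any online CPU will do */
            if (cpu_online_map[i])
                return i;
        return -1;
    }

    int main(void)
    {
        printf("%d %d\n", fallback_cpu(1), fallback_cpu(2)); /* 0 0 */
        return 0;
    }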
| @@ -1523,7 +1533,8 @@ out: | |||
| 1523 | */ | 1533 | */ |
| 1524 | int wake_up_process(struct task_struct *p) | 1534 | int wake_up_process(struct task_struct *p) |
| 1525 | { | 1535 | { |
| 1526 | return try_to_wake_up(p, TASK_ALL, 0); | 1536 | WARN_ON(task_is_stopped_or_traced(p)); |
| 1537 | return try_to_wake_up(p, TASK_NORMAL, 0); | ||
| 1527 | } | 1538 | } |
| 1528 | EXPORT_SYMBOL(wake_up_process); | 1539 | EXPORT_SYMBOL(wake_up_process); |
| 1529 | 1540 | ||
| @@ -1741,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | |||
| 1741 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 1752 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
| 1742 | { | 1753 | { |
| 1743 | struct preempt_notifier *notifier; | 1754 | struct preempt_notifier *notifier; |
| 1744 | struct hlist_node *node; | ||
| 1745 | 1755 | ||
| 1746 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1756 | hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
| 1747 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | 1757 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
| 1748 | } | 1758 | } |
| 1749 | 1759 | ||
| @@ -1752,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
| 1752 | struct task_struct *next) | 1762 | struct task_struct *next) |
| 1753 | { | 1763 | { |
| 1754 | struct preempt_notifier *notifier; | 1764 | struct preempt_notifier *notifier; |
| 1755 | struct hlist_node *node; | ||
| 1756 | 1765 | ||
| 1757 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1766 | hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
| 1758 | notifier->ops->sched_out(notifier, next); | 1767 | notifier->ops->sched_out(notifier, next); |
| 1759 | } | 1768 | } |
| 1760 | 1769 | ||
| @@ -1968,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 1968 | } | 1977 | } |
| 1969 | 1978 | ||
| 1970 | /* | 1979 | /* |
| 1971 | * nr_running, nr_uninterruptible and nr_context_switches: | 1980 | * nr_running and nr_context_switches: |
| 1972 | * | 1981 | * |
| 1973 | * externally visible scheduler statistics: current number of runnable | 1982 | * externally visible scheduler statistics: current number of runnable |
| 1974 | * threads, current number of uninterruptible-sleeping threads, total | 1983 | * threads, total number of context switches performed since bootup. |
| 1975 | * number of context switches performed since bootup. | ||
| 1976 | */ | 1984 | */ |
| 1977 | unsigned long nr_running(void) | 1985 | unsigned long nr_running(void) |
| 1978 | { | 1986 | { |
| @@ -1984,23 +1992,6 @@ unsigned long nr_running(void) | |||
| 1984 | return sum; | 1992 | return sum; |
| 1985 | } | 1993 | } |
| 1986 | 1994 | ||
| 1987 | unsigned long nr_uninterruptible(void) | ||
| 1988 | { | ||
| 1989 | unsigned long i, sum = 0; | ||
| 1990 | |||
| 1991 | for_each_possible_cpu(i) | ||
| 1992 | sum += cpu_rq(i)->nr_uninterruptible; | ||
| 1993 | |||
| 1994 | /* | ||
| 1995 | * Since we read the counters lockless, it might be slightly | ||
| 1996 | * inaccurate. Do not allow it to go below zero though: | ||
| 1997 | */ | ||
| 1998 | if (unlikely((long)sum < 0)) | ||
| 1999 | sum = 0; | ||
| 2000 | |||
| 2001 | return sum; | ||
| 2002 | } | ||
| 2003 | |||
| 2004 | unsigned long long nr_context_switches(void) | 1995 | unsigned long long nr_context_switches(void) |
| 2005 | { | 1996 | { |
| 2006 | int i; | 1997 | int i; |
| @@ -2785,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 2785 | if (irqs_disabled()) | 2776 | if (irqs_disabled()) |
| 2786 | print_irqtrace_events(prev); | 2777 | print_irqtrace_events(prev); |
| 2787 | dump_stack(); | 2778 | dump_stack(); |
| 2788 | add_taint(TAINT_WARN); | 2779 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
| 2789 | } | 2780 | } |
| 2790 | 2781 | ||
| 2791 | /* | 2782 | /* |
| @@ -4410,20 +4401,32 @@ EXPORT_SYMBOL(yield); | |||
| 4410 | * It's the caller's job to ensure that the target task struct | 4401 | * It's the caller's job to ensure that the target task struct |
| 4411 | * can't go away on us before we can do any checks. | 4402 | * can't go away on us before we can do any checks. |
| 4412 | * | 4403 | * |
| 4413 | * Returns true if we indeed boosted the target task. | 4404 | * Returns: |
| 4405 | * true (>0) if we indeed boosted the target task. | ||
| 4406 | * false (0) if we failed to boost the target. | ||
| 4407 | * -ESRCH if there's no task to yield to. | ||
| 4414 | */ | 4408 | */ |
| 4415 | bool __sched yield_to(struct task_struct *p, bool preempt) | 4409 | bool __sched yield_to(struct task_struct *p, bool preempt) |
| 4416 | { | 4410 | { |
| 4417 | struct task_struct *curr = current; | 4411 | struct task_struct *curr = current; |
| 4418 | struct rq *rq, *p_rq; | 4412 | struct rq *rq, *p_rq; |
| 4419 | unsigned long flags; | 4413 | unsigned long flags; |
| 4420 | bool yielded = 0; | 4414 | int yielded = 0; |
| 4421 | 4415 | ||
| 4422 | local_irq_save(flags); | 4416 | local_irq_save(flags); |
| 4423 | rq = this_rq(); | 4417 | rq = this_rq(); |
| 4424 | 4418 | ||
| 4425 | again: | 4419 | again: |
| 4426 | p_rq = task_rq(p); | 4420 | p_rq = task_rq(p); |
| 4421 | /* | ||
| 4422 | * If we're the only runnable task on the rq and target rq also | ||
| 4423 | * has only one task, there's absolutely no point in yielding. | ||
| 4424 | */ | ||
| 4425 | if (rq->nr_running == 1 && p_rq->nr_running == 1) { | ||
| 4426 | yielded = -ESRCH; | ||
| 4427 | goto out_irq; | ||
| 4428 | } | ||
| 4429 | |||
| 4427 | double_rq_lock(rq, p_rq); | 4430 | double_rq_lock(rq, p_rq); |
| 4428 | while (task_rq(p) != p_rq) { | 4431 | while (task_rq(p) != p_rq) { |
| 4429 | double_rq_unlock(rq, p_rq); | 4432 | double_rq_unlock(rq, p_rq); |
| @@ -4431,13 +4434,13 @@ again: | |||
| 4431 | } | 4434 | } |
| 4432 | 4435 | ||
| 4433 | if (!curr->sched_class->yield_to_task) | 4436 | if (!curr->sched_class->yield_to_task) |
| 4434 | goto out; | 4437 | goto out_unlock; |
| 4435 | 4438 | ||
| 4436 | if (curr->sched_class != p->sched_class) | 4439 | if (curr->sched_class != p->sched_class) |
| 4437 | goto out; | 4440 | goto out_unlock; |
| 4438 | 4441 | ||
| 4439 | if (task_running(p_rq, p) || p->state) | 4442 | if (task_running(p_rq, p) || p->state) |
| 4440 | goto out; | 4443 | goto out_unlock; |
| 4441 | 4444 | ||
| 4442 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 4445 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
| 4443 | if (yielded) { | 4446 | if (yielded) { |
| @@ -4450,11 +4453,12 @@ again: | |||
| 4450 | resched_task(p_rq->curr); | 4453 | resched_task(p_rq->curr); |
| 4451 | } | 4454 | } |
| 4452 | 4455 | ||
| 4453 | out: | 4456 | out_unlock: |
| 4454 | double_rq_unlock(rq, p_rq); | 4457 | double_rq_unlock(rq, p_rq); |
| 4458 | out_irq: | ||
| 4455 | local_irq_restore(flags); | 4459 | local_irq_restore(flags); |
| 4456 | 4460 | ||
| 4457 | if (yielded) | 4461 | if (yielded > 0) |
| 4458 | schedule(); | 4462 | schedule(); |
| 4459 | 4463 | ||
| 4460 | return yielded; | 4464 | return yielded; |
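yield_to() now reports three outcomes: a positive value when the target was boosted, zero when the boost failed, and -ESRCH when yielding is pointless because both runqueues hold a single task. The sketch below shows how a caller might consume that convention; toy_yield_to() and spin_then_yield() are stand-ins, not kernel functions, and the candidate scan is purely illustrative:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in with the same convention: >0 boosted, 0 failed, -ESRCH pointless. */
    static int toy_yield_to(int target, bool preempt)
    {
        (void)preempt;
        return target == 0 ? -ESRCH : target & 1; /* arbitrary toy behaviour */
    }

    /* Illustrative caller: stop once a yield succeeded or the scheduler
     * says yielding cannot help right now. */
    static void spin_then_yield(const int *candidates, int n)
    {
        int i, ret;

        for (i = 0; i < n; i++) {
            ret = toy_yield_to(candidates[i], true);
            if (ret > 0) {
                printf("boosted candidate %d\n", candidates[i]);
                return;
            }
            if (ret == -ESRCH) /* yielding cannot help: stop scanning */
                return;
            /* ret == 0: try the next candidate */
        }
    }

    int main(void)
    {
        int candidates[] = { 2, 4, 5 };

        spin_then_yield(candidates, 3); /* prints "boosted candidate 5" */
        return 0;
    }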
| @@ -4713,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4713 | */ | 4717 | */ |
| 4714 | idle->sched_class = &idle_sched_class; | 4718 | idle->sched_class = &idle_sched_class; |
| 4715 | ftrace_graph_init_idle_task(idle, cpu); | 4719 | ftrace_graph_init_idle_task(idle, cpu); |
| 4720 | vtime_init_idle(idle); | ||
| 4716 | #if defined(CONFIG_SMP) | 4721 | #if defined(CONFIG_SMP) |
| 4717 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); | 4722 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); |
| 4718 | #endif | 4723 | #endif |
| @@ -7206,7 +7211,6 @@ static void free_sched_group(struct task_group *tg) | |||
| 7206 | struct task_group *sched_create_group(struct task_group *parent) | 7211 | struct task_group *sched_create_group(struct task_group *parent) |
| 7207 | { | 7212 | { |
| 7208 | struct task_group *tg; | 7213 | struct task_group *tg; |
| 7209 | unsigned long flags; | ||
| 7210 | 7214 | ||
| 7211 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 7215 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
| 7212 | if (!tg) | 7216 | if (!tg) |
| @@ -7218,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
| 7218 | if (!alloc_rt_sched_group(tg, parent)) | 7222 | if (!alloc_rt_sched_group(tg, parent)) |
| 7219 | goto err; | 7223 | goto err; |
| 7220 | 7224 | ||
| 7225 | return tg; | ||
| 7226 | |||
| 7227 | err: | ||
| 7228 | free_sched_group(tg); | ||
| 7229 | return ERR_PTR(-ENOMEM); | ||
| 7230 | } | ||
| 7231 | |||
| 7232 | void sched_online_group(struct task_group *tg, struct task_group *parent) | ||
| 7233 | { | ||
| 7234 | unsigned long flags; | ||
| 7235 | |||
| 7221 | spin_lock_irqsave(&task_group_lock, flags); | 7236 | spin_lock_irqsave(&task_group_lock, flags); |
| 7222 | list_add_rcu(&tg->list, &task_groups); | 7237 | list_add_rcu(&tg->list, &task_groups); |
| 7223 | 7238 | ||
| @@ -7227,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
| 7227 | INIT_LIST_HEAD(&tg->children); | 7242 | INIT_LIST_HEAD(&tg->children); |
| 7228 | list_add_rcu(&tg->siblings, &parent->children); | 7243 | list_add_rcu(&tg->siblings, &parent->children); |
| 7229 | spin_unlock_irqrestore(&task_group_lock, flags); | 7244 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7230 | |||
| 7231 | return tg; | ||
| 7232 | |||
| 7233 | err: | ||
| 7234 | free_sched_group(tg); | ||
| 7235 | return ERR_PTR(-ENOMEM); | ||
| 7236 | } | 7245 | } |
| 7237 | 7246 | ||
| 7238 | /* rcu callback to free various structures associated with a task group */ | 7247 | /* rcu callback to free various structures associated with a task group */ |
| @@ -7245,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) | |||
| 7245 | /* Destroy runqueue etc associated with a task group */ | 7254 | /* Destroy runqueue etc associated with a task group */ |
| 7246 | void sched_destroy_group(struct task_group *tg) | 7255 | void sched_destroy_group(struct task_group *tg) |
| 7247 | { | 7256 | { |
| 7257 | /* wait for possible concurrent references to cfs_rqs complete */ | ||
| 7258 | call_rcu(&tg->rcu, free_sched_group_rcu); | ||
| 7259 | } | ||
| 7260 | |||
| 7261 | void sched_offline_group(struct task_group *tg) | ||
| 7262 | { | ||
| 7248 | unsigned long flags; | 7263 | unsigned long flags; |
| 7249 | int i; | 7264 | int i; |
| 7250 | 7265 | ||
| @@ -7256,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg) | |||
| 7256 | list_del_rcu(&tg->list); | 7271 | list_del_rcu(&tg->list); |
| 7257 | list_del_rcu(&tg->siblings); | 7272 | list_del_rcu(&tg->siblings); |
| 7258 | spin_unlock_irqrestore(&task_group_lock, flags); | 7273 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7259 | |||
| 7260 | /* wait for possible concurrent references to cfs_rqs complete */ | ||
| 7261 | call_rcu(&tg->rcu, free_sched_group_rcu); | ||
| 7262 | } | 7274 | } |
| 7263 | 7275 | ||
| 7264 | /* change task's runqueue when it moves between groups. | 7276 | /* change task's runqueue when it moves between groups. |
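The hunks above split group creation from publication (sched_create_group() followed by sched_online_group()) and unlinking from the RCU-deferred free (sched_offline_group() followed by sched_destroy_group()), so the cgroup css_online/css_offline callbacks added further down can drive each half. A compressed sketch of that lifecycle pairing; the toy_group type and its helpers are made up, and a plain free() stands in for the call_rcu() deferral:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_group {
        const char *name;
        int online; /* linked into the global list and visible to the scheduler? */
    };

    /* Allocation only; nothing is published yet (the create half). */
    static struct toy_group *group_create(const char *name)
    {
        struct toy_group *g = calloc(1, sizeof(*g));

        if (g)
            g->name = name;
        return g;
    }

    /* Publish and unpublish, the online/offline halves. */
    static void group_online(struct toy_group *g)  { g->online = 1; }
    static void group_offline(struct toy_group *g) { g->online = 0; }

    /* Free only after readers are done; a plain free() stands in for the
     * call_rcu() deferral used by the destroy half. */
    static void group_destroy(struct toy_group *g) { free(g); }

    int main(void)
    {
        struct toy_group *g = group_create("demo");

        if (!g)
            return 1;
        group_online(g);              /* from the online callback */
        printf("%s online=%d\n", g->name, g->online);
        group_offline(g);             /* from the offline callback */
        group_destroy(g);             /* deferred free in the real code */
        return 0;
    }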
| @@ -7554,6 +7566,25 @@ static int sched_rt_global_constraints(void) | |||
| 7554 | } | 7566 | } |
| 7555 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7567 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7556 | 7568 | ||
| 7569 | int sched_rr_handler(struct ctl_table *table, int write, | ||
| 7570 | void __user *buffer, size_t *lenp, | ||
| 7571 | loff_t *ppos) | ||
| 7572 | { | ||
| 7573 | int ret; | ||
| 7574 | static DEFINE_MUTEX(mutex); | ||
| 7575 | |||
| 7576 | mutex_lock(&mutex); | ||
| 7577 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 7578 | /* make sure that internally we keep jiffies */ | ||
| 7579 | /* also, writing zero resets timeslice to default */ | ||
| 7580 | if (!ret && write) { | ||
| 7581 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | ||
| 7582 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | ||
| 7583 | } | ||
| 7584 | mutex_unlock(&mutex); | ||
| 7585 | return ret; | ||
| 7586 | } | ||
| 7587 | |||
| 7557 | int sched_rt_handler(struct ctl_table *table, int write, | 7588 | int sched_rt_handler(struct ctl_table *table, int write, |
| 7558 | void __user *buffer, size_t *lenp, | 7589 | void __user *buffer, size_t *lenp, |
| 7559 | loff_t *ppos) | 7590 | loff_t *ppos) |
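sched_rr_handler() above keeps sched_rr_timeslice in jiffies internally: a write of zero or a negative value restores the RR_TIMESLICE default, anything else is taken as milliseconds and converted. A sketch of that normalization under assumed HZ and default values; TOY_HZ, TOY_RR_DEFAULT and the helper names are not the kernel's:

    #include <stdio.h>

    #define TOY_HZ         100                    /* assumption: 100 jiffies per second */
    #define TOY_RR_DEFAULT (100 * TOY_HZ / 1000)  /* assumed default timeslice, in jiffies */

    static int toy_msecs_to_jiffies(int msecs)
    {
        return msecs * TOY_HZ / 1000;
    }

    /* Normalize a sysctl write: <= 0 restores the default, anything else is
     * milliseconds from the user, stored internally as jiffies. */
    static int normalize_rr_timeslice(int written_ms)
    {
        return written_ms <= 0 ? TOY_RR_DEFAULT : toy_msecs_to_jiffies(written_ms);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               normalize_rr_timeslice(0),   /* 10: back to the default */
               normalize_rr_timeslice(30),  /* 3 jiffies at HZ=100 */
               normalize_rr_timeslice(-5)); /* 10: default again */
        return 0;
    }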
| @@ -7610,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
| 7610 | return &tg->css; | 7641 | return &tg->css; |
| 7611 | } | 7642 | } |
| 7612 | 7643 | ||
| 7644 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | ||
| 7645 | { | ||
| 7646 | struct task_group *tg = cgroup_tg(cgrp); | ||
| 7647 | struct task_group *parent; | ||
| 7648 | |||
| 7649 | if (!cgrp->parent) | ||
| 7650 | return 0; | ||
| 7651 | |||
| 7652 | parent = cgroup_tg(cgrp->parent); | ||
| 7653 | sched_online_group(tg, parent); | ||
| 7654 | return 0; | ||
| 7655 | } | ||
| 7656 | |||
| 7613 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 7657 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
| 7614 | { | 7658 | { |
| 7615 | struct task_group *tg = cgroup_tg(cgrp); | 7659 | struct task_group *tg = cgroup_tg(cgrp); |
| @@ -7617,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) | |||
| 7617 | sched_destroy_group(tg); | 7661 | sched_destroy_group(tg); |
| 7618 | } | 7662 | } |
| 7619 | 7663 | ||
| 7664 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | ||
| 7665 | { | ||
| 7666 | struct task_group *tg = cgroup_tg(cgrp); | ||
| 7667 | |||
| 7668 | sched_offline_group(tg); | ||
| 7669 | } | ||
| 7670 | |||
| 7620 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 7671 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, |
| 7621 | struct cgroup_taskset *tset) | 7672 | struct cgroup_taskset *tset) |
| 7622 | { | 7673 | { |
| @@ -7972,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 7972 | .name = "cpu", | 8023 | .name = "cpu", |
| 7973 | .css_alloc = cpu_cgroup_css_alloc, | 8024 | .css_alloc = cpu_cgroup_css_alloc, |
| 7974 | .css_free = cpu_cgroup_css_free, | 8025 | .css_free = cpu_cgroup_css_free, |
| 8026 | .css_online = cpu_cgroup_css_online, | ||
| 8027 | .css_offline = cpu_cgroup_css_offline, | ||
| 7975 | .can_attach = cpu_cgroup_can_attach, | 8028 | .can_attach = cpu_cgroup_can_attach, |
| 7976 | .attach = cpu_cgroup_attach, | 8029 | .attach = cpu_cgroup_attach, |
| 7977 | .exit = cpu_cgroup_exit, | 8030 | .exit = cpu_cgroup_exit, |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53ee..1095e878a46f 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -28,6 +28,8 @@ | |||
| 28 | */ | 28 | */ |
| 29 | 29 | ||
| 30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
| 31 | #include <linux/sched.h> | ||
| 32 | #include <linux/sched/rt.h> | ||
| 31 | #include "cpupri.h" | 33 | #include "cpupri.h" |
| 32 | 34 | ||
| 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 35 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf79..ed12cbb135f4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | #include <linux/tsacct_kern.h> | 3 | #include <linux/tsacct_kern.h> |
| 4 | #include <linux/kernel_stat.h> | 4 | #include <linux/kernel_stat.h> |
| 5 | #include <linux/static_key.h> | 5 | #include <linux/static_key.h> |
| 6 | #include <linux/context_tracking.h> | ||
| 6 | #include "sched.h" | 7 | #include "sched.h" |
| 7 | 8 | ||
| 8 | 9 | ||
| @@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
| 163 | task_group_account_field(p, index, (__force u64) cputime); | 164 | task_group_account_field(p, index, (__force u64) cputime); |
| 164 | 165 | ||
| 165 | /* Account for user time used */ | 166 | /* Account for user time used */ |
| 166 | acct_update_integrals(p); | 167 | acct_account_cputime(p); |
| 167 | } | 168 | } |
| 168 | 169 | ||
| 169 | /* | 170 | /* |
| @@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
| 213 | task_group_account_field(p, index, (__force u64) cputime); | 214 | task_group_account_field(p, index, (__force u64) cputime); |
| 214 | 215 | ||
| 215 | /* Account for system time used */ | 216 | /* Account for system time used */ |
| 216 | acct_update_integrals(p); | 217 | acct_account_cputime(p); |
| 217 | } | 218 | } |
| 218 | 219 | ||
| 219 | /* | 220 | /* |
| @@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void) | |||
| 295 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 296 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
| 296 | { | 297 | { |
| 297 | struct signal_struct *sig = tsk->signal; | 298 | struct signal_struct *sig = tsk->signal; |
| 299 | cputime_t utime, stime; | ||
| 298 | struct task_struct *t; | 300 | struct task_struct *t; |
| 299 | 301 | ||
| 300 | times->utime = sig->utime; | 302 | times->utime = sig->utime; |
| @@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 308 | 310 | ||
| 309 | t = tsk; | 311 | t = tsk; |
| 310 | do { | 312 | do { |
| 311 | times->utime += t->utime; | 313 | task_cputime(t, &utime, &stime); |
| 312 | times->stime += t->stime; | 314 | times->utime += utime; |
| 315 | times->stime += stime; | ||
| 313 | times->sum_exec_runtime += task_sched_runtime(t); | 316 | times->sum_exec_runtime += task_sched_runtime(t); |
| 314 | } while_each_thread(tsk, t); | 317 | } while_each_thread(tsk, t); |
| 315 | out: | 318 | out: |
| 316 | rcu_read_unlock(); | 319 | rcu_read_unlock(); |
| 317 | } | 320 | } |
| 318 | 321 | ||
| 319 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 320 | |||
| 321 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 322 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 322 | /* | 323 | /* |
| 323 | * Account a tick to a process and cpustat | 324 | * Account a tick to a process and cpustat |
| @@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks) | |||
| 382 | irqtime_account_process_tick(current, 0, rq); | 383 | irqtime_account_process_tick(current, 0, rq); |
| 383 | } | 384 | } |
| 384 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 385 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 385 | static void irqtime_account_idle_ticks(int ticks) {} | 386 | static inline void irqtime_account_idle_ticks(int ticks) {} |
| 386 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 387 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
| 387 | struct rq *rq) {} | 388 | struct rq *rq) {} |
| 388 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 389 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 389 | 390 | ||
| 391 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
| 390 | /* | 392 | /* |
| 391 | * Account a single tick of cpu time. | 393 | * Account a single tick of cpu time. |
| 392 | * @p: the process that the cpu time gets accounted to | 394 | * @p: the process that the cpu time gets accounted to |
| @@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 397 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 399 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
| 398 | struct rq *rq = this_rq(); | 400 | struct rq *rq = this_rq(); |
| 399 | 401 | ||
| 402 | if (vtime_accounting_enabled()) | ||
| 403 | return; | ||
| 404 | |||
| 400 | if (sched_clock_irqtime) { | 405 | if (sched_clock_irqtime) { |
| 401 | irqtime_account_process_tick(p, user_tick, rq); | 406 | irqtime_account_process_tick(p, user_tick, rq); |
| 402 | return; | 407 | return; |
| @@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks) | |||
| 438 | 443 | ||
| 439 | account_idle_time(jiffies_to_cputime(ticks)); | 444 | account_idle_time(jiffies_to_cputime(ticks)); |
| 440 | } | 445 | } |
| 441 | 446 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | |
| 442 | #endif | ||
| 443 | 447 | ||
| 444 | /* | 448 | /* |
| 445 | * Use precise platform statistics if available: | 449 | * Use precise platform statistics if available: |
| @@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
| 461 | *st = cputime.stime; | 465 | *st = cputime.stime; |
| 462 | } | 466 | } |
| 463 | 467 | ||
| 464 | void vtime_account_system_irqsafe(struct task_struct *tsk) | ||
| 465 | { | ||
| 466 | unsigned long flags; | ||
| 467 | |||
| 468 | local_irq_save(flags); | ||
| 469 | vtime_account_system(tsk); | ||
| 470 | local_irq_restore(flags); | ||
| 471 | } | ||
| 472 | EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); | ||
| 473 | |||
| 474 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 468 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
| 475 | void vtime_task_switch(struct task_struct *prev) | 469 | void vtime_task_switch(struct task_struct *prev) |
| 476 | { | 470 | { |
| 471 | if (!vtime_accounting_enabled()) | ||
| 472 | return; | ||
| 473 | |||
| 477 | if (is_idle_task(prev)) | 474 | if (is_idle_task(prev)) |
| 478 | vtime_account_idle(prev); | 475 | vtime_account_idle(prev); |
| 479 | else | 476 | else |
| 480 | vtime_account_system(prev); | 477 | vtime_account_system(prev); |
| 481 | 478 | ||
| 479 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
| 482 | vtime_account_user(prev); | 480 | vtime_account_user(prev); |
| 481 | #endif | ||
| 483 | arch_vtime_task_switch(prev); | 482 | arch_vtime_task_switch(prev); |
| 484 | } | 483 | } |
| 485 | #endif | 484 | #endif |
| @@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev) | |||
| 493 | * vtime_account(). | 492 | * vtime_account(). |
| 494 | */ | 493 | */ |
| 495 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 494 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
| 496 | void vtime_account(struct task_struct *tsk) | 495 | void vtime_account_irq_enter(struct task_struct *tsk) |
| 497 | { | 496 | { |
| 498 | if (in_interrupt() || !is_idle_task(tsk)) | 497 | if (!vtime_accounting_enabled()) |
| 499 | vtime_account_system(tsk); | 498 | return; |
| 500 | else | 499 | |
| 501 | vtime_account_idle(tsk); | 500 | if (!in_interrupt()) { |
| 501 | /* | ||
| 502 | * If we interrupted user, context_tracking_in_user() | ||
| 503 | * is 1 because context tracking doesn't hook | ||
| 504 | * into irq entry/exit. This way we know if | ||
| 505 | * we need to flush user time on kernel entry. | ||
| 506 | */ | ||
| 507 | if (context_tracking_in_user()) { | ||
| 508 | vtime_account_user(tsk); | ||
| 509 | return; | ||
| 510 | } | ||
| 511 | |||
| 512 | if (is_idle_task(tsk)) { | ||
| 513 | vtime_account_idle(tsk); | ||
| 514 | return; | ||
| 515 | } | ||
| 516 | } | ||
| 517 | vtime_account_system(tsk); | ||
| 502 | } | 518 | } |
| 503 | EXPORT_SYMBOL_GPL(vtime_account); | 519 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
| 504 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 520 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
| 505 | 521 | ||
| 506 | #else | 522 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ |
| 507 | |||
| 508 | #ifndef nsecs_to_cputime | ||
| 509 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 510 | #endif | ||
| 511 | 523 | ||
| 512 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | 524 | static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) |
| 513 | { | 525 | { |
| 514 | u64 temp = (__force u64) rtime; | 526 | u64 temp = (__force u64) rtime; |
| 515 | 527 | ||
| 516 | temp *= (__force u64) utime; | 528 | temp *= (__force u64) stime; |
| 517 | 529 | ||
| 518 | if (sizeof(cputime_t) == 4) | 530 | if (sizeof(cputime_t) == 4) |
| 519 | temp = div_u64(temp, (__force u32) total); | 531 | temp = div_u64(temp, (__force u32) total); |
| @@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 531 | struct cputime *prev, | 543 | struct cputime *prev, |
| 532 | cputime_t *ut, cputime_t *st) | 544 | cputime_t *ut, cputime_t *st) |
| 533 | { | 545 | { |
| 534 | cputime_t rtime, utime, total; | 546 | cputime_t rtime, stime, total; |
| 535 | 547 | ||
| 536 | utime = curr->utime; | 548 | stime = curr->stime; |
| 537 | total = utime + curr->stime; | 549 | total = stime + curr->utime; |
| 538 | 550 | ||
| 539 | /* | 551 | /* |
| 540 | * Tick based cputime accounting depends on random scheduling | 552 | * Tick based cputime accounting depends on random scheduling |
| @@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 549 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 561 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
| 550 | 562 | ||
| 551 | if (total) | 563 | if (total) |
| 552 | utime = scale_utime(utime, rtime, total); | 564 | stime = scale_stime(stime, rtime, total); |
| 553 | else | 565 | else |
| 554 | utime = rtime; | 566 | stime = rtime; |
| 555 | 567 | ||
| 556 | /* | 568 | /* |
| 557 | * If the tick based count grows faster than the scheduler one, | 569 | * If the tick based count grows faster than the scheduler one, |
| 558 | * the result of the scaling may go backward. | 570 | * the result of the scaling may go backward. |
| 559 | * Let's enforce monotonicity. | 571 | * Let's enforce monotonicity. |
| 560 | */ | 572 | */ |
| 561 | prev->utime = max(prev->utime, utime); | 573 | prev->stime = max(prev->stime, stime); |
| 562 | prev->stime = max(prev->stime, rtime - prev->utime); | 574 | prev->utime = max(prev->utime, rtime - prev->stime); |
| 563 | 575 | ||
| 564 | *ut = prev->utime; | 576 | *ut = prev->utime; |
| 565 | *st = prev->stime; | 577 | *st = prev->stime; |
| @@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 568 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 580 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 569 | { | 581 | { |
| 570 | struct task_cputime cputime = { | 582 | struct task_cputime cputime = { |
| 571 | .utime = p->utime, | ||
| 572 | .stime = p->stime, | ||
| 573 | .sum_exec_runtime = p->se.sum_exec_runtime, | 583 | .sum_exec_runtime = p->se.sum_exec_runtime, |
| 574 | }; | 584 | }; |
| 575 | 585 | ||
| 586 | task_cputime(p, &cputime.utime, &cputime.stime); | ||
| 576 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 587 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
| 577 | } | 588 | } |
| 578 | 589 | ||
| @@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
| 586 | thread_group_cputime(p, &cputime); | 597 | thread_group_cputime(p, &cputime); |
| 587 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | 598 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); |
| 588 | } | 599 | } |
| 589 | #endif | 600 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ |
| 601 | |||
| 602 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | ||
| 603 | static unsigned long long vtime_delta(struct task_struct *tsk) | ||
| 604 | { | ||
| 605 | unsigned long long clock; | ||
| 606 | |||
| 607 | clock = local_clock(); | ||
| 608 | if (clock < tsk->vtime_snap) | ||
| 609 | return 0; | ||
| 610 | |||
| 611 | return clock - tsk->vtime_snap; | ||
| 612 | } | ||
| 613 | |||
| 614 | static cputime_t get_vtime_delta(struct task_struct *tsk) | ||
| 615 | { | ||
| 616 | unsigned long long delta = vtime_delta(tsk); | ||
| 617 | |||
| 618 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); | ||
| 619 | tsk->vtime_snap += delta; | ||
| 620 | |||
| 621 | /* CHECKME: always safe to convert nsecs to cputime? */ | ||
| 622 | return nsecs_to_cputime(delta); | ||
| 623 | } | ||
| 624 | |||
| 625 | static void __vtime_account_system(struct task_struct *tsk) | ||
| 626 | { | ||
| 627 | cputime_t delta_cpu = get_vtime_delta(tsk); | ||
| 628 | |||
| 629 | account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); | ||
| 630 | } | ||
| 631 | |||
| 632 | void vtime_account_system(struct task_struct *tsk) | ||
| 633 | { | ||
| 634 | if (!vtime_accounting_enabled()) | ||
| 635 | return; | ||
| 636 | |||
| 637 | write_seqlock(&tsk->vtime_seqlock); | ||
| 638 | __vtime_account_system(tsk); | ||
| 639 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 640 | } | ||
| 641 | |||
| 642 | void vtime_account_irq_exit(struct task_struct *tsk) | ||
| 643 | { | ||
| 644 | if (!vtime_accounting_enabled()) | ||
| 645 | return; | ||
| 646 | |||
| 647 | write_seqlock(&tsk->vtime_seqlock); | ||
| 648 | if (context_tracking_in_user()) | ||
| 649 | tsk->vtime_snap_whence = VTIME_USER; | ||
| 650 | __vtime_account_system(tsk); | ||
| 651 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 652 | } | ||
| 653 | |||
| 654 | void vtime_account_user(struct task_struct *tsk) | ||
| 655 | { | ||
| 656 | cputime_t delta_cpu; | ||
| 657 | |||
| 658 | if (!vtime_accounting_enabled()) | ||
| 659 | return; | ||
| 660 | |||
| 661 | delta_cpu = get_vtime_delta(tsk); | ||
| 662 | |||
| 663 | write_seqlock(&tsk->vtime_seqlock); | ||
| 664 | tsk->vtime_snap_whence = VTIME_SYS; | ||
| 665 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | ||
| 666 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 667 | } | ||
| 668 | |||
| 669 | void vtime_user_enter(struct task_struct *tsk) | ||
| 670 | { | ||
| 671 | if (!vtime_accounting_enabled()) | ||
| 672 | return; | ||
| 673 | |||
| 674 | write_seqlock(&tsk->vtime_seqlock); | ||
| 675 | tsk->vtime_snap_whence = VTIME_USER; | ||
| 676 | __vtime_account_system(tsk); | ||
| 677 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 678 | } | ||
| 679 | |||
| 680 | void vtime_guest_enter(struct task_struct *tsk) | ||
| 681 | { | ||
| 682 | write_seqlock(&tsk->vtime_seqlock); | ||
| 683 | __vtime_account_system(tsk); | ||
| 684 | current->flags |= PF_VCPU; | ||
| 685 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 686 | } | ||
| 687 | |||
| 688 | void vtime_guest_exit(struct task_struct *tsk) | ||
| 689 | { | ||
| 690 | write_seqlock(&tsk->vtime_seqlock); | ||
| 691 | __vtime_account_system(tsk); | ||
| 692 | current->flags &= ~PF_VCPU; | ||
| 693 | write_sequnlock(&tsk->vtime_seqlock); | ||
| 694 | } | ||
| 695 | |||
| 696 | void vtime_account_idle(struct task_struct *tsk) | ||
| 697 | { | ||
| 698 | cputime_t delta_cpu = get_vtime_delta(tsk); | ||
| 699 | |||
| 700 | account_idle_time(delta_cpu); | ||
| 701 | } | ||
| 702 | |||
| 703 | bool vtime_accounting_enabled(void) | ||
| 704 | { | ||
| 705 | return context_tracking_active(); | ||
| 706 | } | ||
| 707 | |||
| 708 | void arch_vtime_task_switch(struct task_struct *prev) | ||
| 709 | { | ||
| 710 | write_seqlock(&prev->vtime_seqlock); | ||
| 711 | prev->vtime_snap_whence = VTIME_SLEEPING; | ||
| 712 | write_sequnlock(&prev->vtime_seqlock); | ||
| 713 | |||
| 714 | write_seqlock(¤t->vtime_seqlock); | ||
| 715 | current->vtime_snap_whence = VTIME_SYS; | ||
| 716 | current->vtime_snap = sched_clock(); | ||
| 717 | write_sequnlock(¤t->vtime_seqlock); | ||
| 718 | } | ||
| 719 | |||
| 720 | void vtime_init_idle(struct task_struct *t) | ||
| 721 | { | ||
| 722 | unsigned long flags; | ||
| 723 | |||
| 724 | write_seqlock_irqsave(&t->vtime_seqlock, flags); | ||
| 725 | t->vtime_snap_whence = VTIME_SYS; | ||
| 726 | t->vtime_snap = sched_clock(); | ||
| 727 | write_sequnlock_irqrestore(&t->vtime_seqlock, flags); | ||
| 728 | } | ||
| 729 | |||
| 730 | cputime_t task_gtime(struct task_struct *t) | ||
| 731 | { | ||
| 732 | unsigned int seq; | ||
| 733 | cputime_t gtime; | ||
| 734 | |||
| 735 | do { | ||
| 736 | seq = read_seqbegin(&t->vtime_seqlock); | ||
| 737 | |||
| 738 | gtime = t->gtime; | ||
| 739 | if (t->flags & PF_VCPU) | ||
| 740 | gtime += vtime_delta(t); | ||
| 741 | |||
| 742 | } while (read_seqretry(&t->vtime_seqlock, seq)); | ||
| 743 | |||
| 744 | return gtime; | ||
| 745 | } | ||
| 746 | |||
| 747 | /* | ||
| 748 | * Fetch the raw cputime values from the task_struct fields and | ||
| 749 | * add up the pending nohz execution time since the last | ||
| 750 | * cputime snapshot. | ||
| 751 | */ | ||
| 752 | static void | ||
| 753 | fetch_task_cputime(struct task_struct *t, | ||
| 754 | cputime_t *u_dst, cputime_t *s_dst, | ||
| 755 | cputime_t *u_src, cputime_t *s_src, | ||
| 756 | cputime_t *udelta, cputime_t *sdelta) | ||
| 757 | { | ||
| 758 | unsigned int seq; | ||
| 759 | unsigned long long delta; | ||
| 760 | |||
| 761 | do { | ||
| 762 | *udelta = 0; | ||
| 763 | *sdelta = 0; | ||
| 764 | |||
| 765 | seq = read_seqbegin(&t->vtime_seqlock); | ||
| 766 | |||
| 767 | if (u_dst) | ||
| 768 | *u_dst = *u_src; | ||
| 769 | if (s_dst) | ||
| 770 | *s_dst = *s_src; | ||
| 771 | |||
| 772 | /* Task is sleeping, nothing to add */ | ||
| 773 | if (t->vtime_snap_whence == VTIME_SLEEPING || | ||
| 774 | is_idle_task(t)) | ||
| 775 | continue; | ||
| 776 | |||
| 777 | delta = vtime_delta(t); | ||
| 778 | |||
| 779 | /* | ||
| 780 | * The task runs either in user or kernel space; add the pending | ||
| 781 | * nohz time to the right place. | ||
| 782 | */ | ||
| 783 | if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { | ||
| 784 | *udelta = delta; | ||
| 785 | } else { | ||
| 786 | if (t->vtime_snap_whence == VTIME_SYS) | ||
| 787 | *sdelta = delta; | ||
| 788 | } | ||
| 789 | } while (read_seqretry(&t->vtime_seqlock, seq)); | ||
| 790 | } | ||
| 791 | |||
| 792 | |||
| 793 | void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) | ||
| 794 | { | ||
| 795 | cputime_t udelta, sdelta; | ||
| 796 | |||
| 797 | fetch_task_cputime(t, utime, stime, &t->utime, | ||
| 798 | &t->stime, &udelta, &sdelta); | ||
| 799 | if (utime) | ||
| 800 | *utime += udelta; | ||
| 801 | if (stime) | ||
| 802 | *stime += sdelta; | ||
| 803 | } | ||
| 804 | |||
| 805 | void task_cputime_scaled(struct task_struct *t, | ||
| 806 | cputime_t *utimescaled, cputime_t *stimescaled) | ||
| 807 | { | ||
| 808 | cputime_t udelta, sdelta; | ||
| 809 | |||
| 810 | fetch_task_cputime(t, utimescaled, stimescaled, | ||
| 811 | &t->utimescaled, &t->stimescaled, &udelta, &sdelta); | ||
| 812 | if (utimescaled) | ||
| 813 | *utimescaled += cputime_to_scaled(udelta); | ||
| 814 | if (stimescaled) | ||
| 815 | *stimescaled += cputime_to_scaled(sdelta); | ||
| 816 | } | ||
| 817 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | ||
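The cputime.c hunks above switch cputime_adjust() to scaling the tick-sampled system time against the scheduler's precise sum_exec_runtime and then clamping the reported values so they never move backwards. As a rough stand-alone illustration of that arithmetic only (plain uint64_t instead of cputime_t, no overflow handling, names chosen for this sketch, not the kernel implementation):

#include <stdio.h>
#include <stdint.h>

struct prev_cputime { uint64_t utime, stime; };

/* stime * rtime / total; fall back to rtime when there were no ticks */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
        return total ? (stime * rtime) / total : rtime;
}

static void cputime_adjust(uint64_t utime, uint64_t stime, uint64_t rtime,
                           struct prev_cputime *prev,
                           uint64_t *ut, uint64_t *st)
{
        uint64_t total = stime + utime;

        stime = scale_stime(stime, rtime, total);

        /* never report values smaller than what was reported before */
        if (stime > prev->stime)
                prev->stime = stime;
        if (rtime - prev->stime > prev->utime)  /* assumes rtime >= prev->stime */
                prev->utime = rtime - prev->stime;

        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_cputime prev = { 0, 0 };
        uint64_t ut, st;

        /* 30 user ticks, 10 system ticks, 50 units of precise runtime */
        cputime_adjust(30, 10, 50, &prev, &ut, &st);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}

With 30 user ticks, 10 system ticks and 50 units of precise runtime, the sketch reports stime=12 and utime=38, which together add up to the precise runtime.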
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2cd3c1b4e582..75024a673520 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) | |||
| 110 | if (autogroup_path(tg, group_path, PATH_MAX)) | 110 | if (autogroup_path(tg, group_path, PATH_MAX)) |
| 111 | return group_path; | 111 | return group_path; |
| 112 | 112 | ||
| 113 | /* | ||
| 114 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
| 115 | */ | ||
| 116 | if (!tg->css.cgroup) { | ||
| 117 | group_path[0] = '\0'; | ||
| 118 | return group_path; | ||
| 119 | } | ||
| 120 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 113 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
| 121 | return group_path; | 114 | return group_path; |
| 122 | } | 115 | } |
| @@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 222 | cfs_rq->runnable_load_avg); | 215 | cfs_rq->runnable_load_avg); |
| 223 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", | 216 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", |
| 224 | cfs_rq->blocked_load_avg); | 217 | cfs_rq->blocked_load_avg); |
| 225 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", | 218 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", |
| 226 | atomic64_read(&cfs_rq->tg->load_avg)); | 219 | (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); |
| 227 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", | 220 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", |
| 228 | cfs_rq->tg_load_contrib); | 221 | cfs_rq->tg_load_contrib); |
| 229 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | 222 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", |
| @@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 269 | { | 262 | { |
| 270 | unsigned int freq = cpu_khz ? : 1; | 263 | unsigned int freq = cpu_khz ? : 1; |
| 271 | 264 | ||
| 272 | SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", | 265 | SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", |
| 273 | cpu, freq / 1000, (freq % 1000)); | 266 | cpu, freq / 1000, (freq % 1000)); |
| 274 | } | 267 | } |
| 275 | #else | 268 | #else |
| 276 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 269 | SEQ_printf(m, "cpu#%d\n", cpu); |
| 277 | #endif | 270 | #endif |
| 278 | 271 | ||
| 279 | #define P(x) \ | 272 | #define P(x) \ |
| @@ -330,6 +323,7 @@ do { \ | |||
| 330 | print_rq(m, rq, cpu); | 323 | print_rq(m, rq, cpu); |
| 331 | rcu_read_unlock(); | 324 | rcu_read_unlock(); |
| 332 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 325 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
| 326 | SEQ_printf(m, "\n"); | ||
| 333 | } | 327 | } |
| 334 | 328 | ||
| 335 | static const char *sched_tunable_scaling_names[] = { | 329 | static const char *sched_tunable_scaling_names[] = { |
| @@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = { | |||
| 338 | "linear" | 332 | "linear" |
| 339 | }; | 333 | }; |
| 340 | 334 | ||
| 341 | static int sched_debug_show(struct seq_file *m, void *v) | 335 | static void sched_debug_header(struct seq_file *m) |
| 342 | { | 336 | { |
| 343 | u64 ktime, sched_clk, cpu_clk; | 337 | u64 ktime, sched_clk, cpu_clk; |
| 344 | unsigned long flags; | 338 | unsigned long flags; |
| 345 | int cpu; | ||
| 346 | 339 | ||
| 347 | local_irq_save(flags); | 340 | local_irq_save(flags); |
| 348 | ktime = ktime_to_ns(ktime_get()); | 341 | ktime = ktime_to_ns(ktime_get()); |
| @@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 384 | #undef PN | 377 | #undef PN |
| 385 | #undef P | 378 | #undef P |
| 386 | 379 | ||
| 387 | SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", | 380 | SEQ_printf(m, " .%-40s: %d (%s)\n", |
| 381 | "sysctl_sched_tunable_scaling", | ||
| 388 | sysctl_sched_tunable_scaling, | 382 | sysctl_sched_tunable_scaling, |
| 389 | sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); | 383 | sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); |
| 384 | SEQ_printf(m, "\n"); | ||
| 385 | } | ||
| 390 | 386 | ||
| 391 | for_each_online_cpu(cpu) | 387 | static int sched_debug_show(struct seq_file *m, void *v) |
| 392 | print_cpu(m, cpu); | 388 | { |
| 389 | int cpu = (unsigned long)(v - 2); | ||
| 393 | 390 | ||
| 394 | SEQ_printf(m, "\n"); | 391 | if (cpu != -1) |
| 392 | print_cpu(m, cpu); | ||
| 393 | else | ||
| 394 | sched_debug_header(m); | ||
| 395 | 395 | ||
| 396 | return 0; | 396 | return 0; |
| 397 | } | 397 | } |
| 398 | 398 | ||
| 399 | void sysrq_sched_debug_show(void) | 399 | void sysrq_sched_debug_show(void) |
| 400 | { | 400 | { |
| 401 | sched_debug_show(NULL, NULL); | 401 | int cpu; |
| 402 | |||
| 403 | sched_debug_header(NULL); | ||
| 404 | for_each_online_cpu(cpu) | ||
| 405 | print_cpu(NULL, cpu); | ||
| 406 | |||
| 407 | } | ||
| 408 | |||
| 409 | /* | ||
| 410 | * This iterator needs some explanation. | ||
| 411 | * It returns 1 for the header position, | ||
| 412 | * which means 2 corresponds to cpu 0. | ||
| 413 | * In a hotplugged system some cpus, including cpu 0, may be missing, so we have | ||
| 414 | * to use cpumask_* to iterate over the cpus. | ||
| 415 | */ | ||
| 416 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | ||
| 417 | { | ||
| 418 | unsigned long n = *offset; | ||
| 419 | |||
| 420 | if (n == 0) | ||
| 421 | return (void *) 1; | ||
| 422 | |||
| 423 | n--; | ||
| 424 | |||
| 425 | if (n > 0) | ||
| 426 | n = cpumask_next(n - 1, cpu_online_mask); | ||
| 427 | else | ||
| 428 | n = cpumask_first(cpu_online_mask); | ||
| 429 | |||
| 430 | *offset = n + 1; | ||
| 431 | |||
| 432 | if (n < nr_cpu_ids) | ||
| 433 | return (void *)(unsigned long)(n + 2); | ||
| 434 | return NULL; | ||
| 435 | } | ||
| 436 | |||
| 437 | static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) | ||
| 438 | { | ||
| 439 | (*offset)++; | ||
| 440 | return sched_debug_start(file, offset); | ||
| 441 | } | ||
| 442 | |||
| 443 | static void sched_debug_stop(struct seq_file *file, void *data) | ||
| 444 | { | ||
| 445 | } | ||
| 446 | |||
| 447 | static const struct seq_operations sched_debug_sops = { | ||
| 448 | .start = sched_debug_start, | ||
| 449 | .next = sched_debug_next, | ||
| 450 | .stop = sched_debug_stop, | ||
| 451 | .show = sched_debug_show, | ||
| 452 | }; | ||
| 453 | |||
| 454 | static int sched_debug_release(struct inode *inode, struct file *file) | ||
| 455 | { | ||
| 456 | seq_release(inode, file); | ||
| 457 | |||
| 458 | return 0; | ||
| 402 | } | 459 | } |
| 403 | 460 | ||
| 404 | static int sched_debug_open(struct inode *inode, struct file *filp) | 461 | static int sched_debug_open(struct inode *inode, struct file *filp) |
| 405 | { | 462 | { |
| 406 | return single_open(filp, sched_debug_show, NULL); | 463 | int ret = 0; |
| 464 | |||
| 465 | ret = seq_open(filp, &sched_debug_sops); | ||
| 466 | |||
| 467 | return ret; | ||
| 407 | } | 468 | } |
| 408 | 469 | ||
| 409 | static const struct file_operations sched_debug_fops = { | 470 | static const struct file_operations sched_debug_fops = { |
| 410 | .open = sched_debug_open, | 471 | .open = sched_debug_open, |
| 411 | .read = seq_read, | 472 | .read = seq_read, |
| 412 | .llseek = seq_lseek, | 473 | .llseek = seq_lseek, |
| 413 | .release = single_release, | 474 | .release = sched_debug_release, |
| 414 | }; | 475 | }; |
| 415 | 476 | ||
| 416 | static int __init init_sched_debug_procfs(void) | 477 | static int __init init_sched_debug_procfs(void) |
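The new sched_debug seq_file iterator above encodes its position as an opaque token: 1 stands for the header and n + 2 stands for online cpu n, which is why sched_debug_show() recovers the cpu with (unsigned long)(v - 2). The stand-alone sketch below only mimics that encoding; the online mask is a plain array standing in for the kernel's cpumask API.

#include <stdio.h>

#define NR_CPUS 8
static const int cpu_online[NR_CPUS] = { 1, 0, 1, 1, 0, 0, 1, 0 };

/* first online cpu >= cpu, or NR_CPUS if there is none */
static unsigned long next_online(unsigned long cpu)
{
        for (; cpu < NR_CPUS; cpu++)
                if (cpu_online[cpu])
                        return cpu;
        return NR_CPUS;
}

static void *debug_start(unsigned long *offset)
{
        unsigned long n = *offset;

        if (n == 0)
                return (void *)1;               /* header token */

        n = next_online(n - 1);                 /* offset m covers cpu m-1 and up */
        *offset = n + 1;
        if (n < NR_CPUS)
                return (void *)(n + 2);         /* token n+2 == cpu n */
        return NULL;                            /* end of sequence */
}

int main(void)
{
        unsigned long pos = 0;
        void *v;

        while ((v = debug_start(&pos)) != NULL) {
                if (v == (void *)1)
                        printf("header\n");
                else
                        printf("cpu#%lu\n", (unsigned long)v - 2);
                pos++;                          /* what the seq_file next() hook does */
        }
        return 0;
}

On the example mask it prints the header followed by cpu#0, cpu#2, cpu#3 and cpu#6, skipping the offline cpus just as cpumask_next() would.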
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eea8707234a..7a33e5986fc5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 1680 | } | 1680 | } |
| 1681 | 1681 | ||
| 1682 | /* ensure we never gain time by being placed backwards. */ | 1682 | /* ensure we never gain time by being placed backwards. */ |
| 1683 | vruntime = max_vruntime(se->vruntime, vruntime); | 1683 | se->vruntime = max_vruntime(se->vruntime, vruntime); |
| 1684 | |||
| 1685 | se->vruntime = vruntime; | ||
| 1686 | } | 1684 | } |
| 1687 | 1685 | ||
| 1688 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | 1686 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); |
| @@ -2663,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 2663 | hrtimer_cancel(&cfs_b->slack_timer); | 2661 | hrtimer_cancel(&cfs_b->slack_timer); |
| 2664 | } | 2662 | } |
| 2665 | 2663 | ||
| 2666 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | 2664 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) |
| 2667 | { | 2665 | { |
| 2668 | struct cfs_rq *cfs_rq; | 2666 | struct cfs_rq *cfs_rq; |
| 2669 | 2667 | ||
| @@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 3254 | */ | 3252 | */ |
| 3255 | static int select_idle_sibling(struct task_struct *p, int target) | 3253 | static int select_idle_sibling(struct task_struct *p, int target) |
| 3256 | { | 3254 | { |
| 3257 | int cpu = smp_processor_id(); | ||
| 3258 | int prev_cpu = task_cpu(p); | ||
| 3259 | struct sched_domain *sd; | 3255 | struct sched_domain *sd; |
| 3260 | struct sched_group *sg; | 3256 | struct sched_group *sg; |
| 3261 | int i; | 3257 | int i = task_cpu(p); |
| 3262 | 3258 | ||
| 3263 | /* | 3259 | if (idle_cpu(target)) |
| 3264 | * If the task is going to be woken-up on this cpu and if it is | 3260 | return target; |
| 3265 | * already idle, then it is the right target. | ||
| 3266 | */ | ||
| 3267 | if (target == cpu && idle_cpu(cpu)) | ||
| 3268 | return cpu; | ||
| 3269 | 3261 | ||
| 3270 | /* | 3262 | /* |
| 3271 | * If the task is going to be woken-up on the cpu where it previously | 3263 | * If the previous cpu is cache affine and idle, don't be stupid. |
| 3272 | * ran and if it is currently idle, then it is the right target. | ||
| 3273 | */ | 3264 | */ |
| 3274 | if (target == prev_cpu && idle_cpu(prev_cpu)) | 3265 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) |
| 3275 | return prev_cpu; | 3266 | return i; |
| 3276 | 3267 | ||
| 3277 | /* | 3268 | /* |
| 3278 | * Otherwise, iterate the domains and find an eligible idle cpu. | 3269 | * Otherwise, iterate the domains and find an eligible idle cpu. |
| @@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 3286 | goto next; | 3277 | goto next; |
| 3287 | 3278 | ||
| 3288 | for_each_cpu(i, sched_group_cpus(sg)) { | 3279 | for_each_cpu(i, sched_group_cpus(sg)) { |
| 3289 | if (!idle_cpu(i)) | 3280 | if (i == target || !idle_cpu(i)) |
| 3290 | goto next; | 3281 | goto next; |
| 3291 | } | 3282 | } |
| 3292 | 3283 | ||
| @@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | |||
| 6101 | * idle runqueue: | 6092 | * idle runqueue: |
| 6102 | */ | 6093 | */ |
| 6103 | if (rq->cfs.load.weight) | 6094 | if (rq->cfs.load.weight) |
| 6104 | rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | 6095 | rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); |
| 6105 | 6096 | ||
| 6106 | return rr_interval; | 6097 | return rr_interval; |
| 6107 | } | 6098 | } |
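The select_idle_sibling() rework above boils down to a short preference order: take the wakeup target if it is already idle, otherwise take the task's previous cpu if it is idle and shares a cache with the target, and only then scan the sched domains. A minimal decision-order sketch, with stand-in predicates instead of the kernel's idle_cpu()/cpus_share_cache() and the domain scan stubbed out:

#include <stdio.h>
#include <stdbool.h>

static bool idle_cpu(int cpu)                 { return cpu == 3; }          /* stub */
static bool cpus_share_cache(int a, int b)    { return (a / 4) == (b / 4); } /* stub */
static int  scan_domains_for_idle(int target) { return target; }            /* stub */

static int select_idle_sibling(int prev_cpu, int target)
{
        if (idle_cpu(target))
                return target;

        /* previous cpu is cache affine and idle: use it */
        if (prev_cpu != target &&
            cpus_share_cache(prev_cpu, target) && idle_cpu(prev_cpu))
                return prev_cpu;

        /* otherwise scan the domains for an eligible idle cpu */
        return scan_domains_for_idle(target);
}

int main(void)
{
        printf("wake on cpu %d\n", select_idle_sibling(3, 1)); /* prev idle, shares cache -> 3 */
        printf("wake on cpu %d\n", select_idle_sibling(7, 1)); /* falls back to the scan */
        return 0;
}

The point of the middle step is that a cache-affine, idle previous cpu is a cheap win that the old code only took when it happened to equal the wakeup target.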
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 418feb01344e..127a2c4cf4ab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -7,6 +7,8 @@ | |||
| 7 | 7 | ||
| 8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
| 9 | 9 | ||
| 10 | int sched_rr_timeslice = RR_TIMESLICE; | ||
| 11 | |||
| 10 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | 12 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); |
| 11 | 13 | ||
| 12 | struct rt_bandwidth def_rt_bandwidth; | 14 | struct rt_bandwidth def_rt_bandwidth; |
| @@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
| 566 | static int do_balance_runtime(struct rt_rq *rt_rq) | 568 | static int do_balance_runtime(struct rt_rq *rt_rq) |
| 567 | { | 569 | { |
| 568 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 570 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 569 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | 571 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; |
| 570 | int i, weight, more = 0; | 572 | int i, weight, more = 0; |
| 571 | u64 rt_period; | 573 | u64 rt_period; |
| 572 | 574 | ||
| @@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq) | |||
| 925 | return; | 927 | return; |
| 926 | 928 | ||
| 927 | delta_exec = rq->clock_task - curr->se.exec_start; | 929 | delta_exec = rq->clock_task - curr->se.exec_start; |
| 928 | if (unlikely((s64)delta_exec < 0)) | 930 | if (unlikely((s64)delta_exec <= 0)) |
| 929 | delta_exec = 0; | 931 | return; |
| 930 | 932 | ||
| 931 | schedstat_set(curr->se.statistics.exec_max, | 933 | schedstat_set(curr->se.statistics.exec_max, |
| 932 | max(curr->se.statistics.exec_max, delta_exec)); | 934 | max(curr->se.statistics.exec_max, delta_exec)); |
| @@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1427 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1429 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1428 | { | 1430 | { |
| 1429 | if (!task_running(rq, p) && | 1431 | if (!task_running(rq, p) && |
| 1430 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1432 | cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 1431 | (p->nr_cpus_allowed > 1)) | ||
| 1432 | return 1; | 1433 | return 1; |
| 1433 | return 0; | 1434 | return 0; |
| 1434 | } | 1435 | } |
| @@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1889 | * we may need to handle the pulling of RT tasks | 1890 | * we may need to handle the pulling of RT tasks |
| 1890 | * now. | 1891 | * now. |
| 1891 | */ | 1892 | */ |
| 1892 | if (p->on_rq && !rq->rt.rt_nr_running) | 1893 | if (!p->on_rq || rq->rt.rt_nr_running) |
| 1893 | pull_rt_task(rq); | 1894 | return; |
| 1895 | |||
| 1896 | if (pull_rt_task(rq)) | ||
| 1897 | resched_task(rq->curr); | ||
| 1894 | } | 1898 | } |
| 1895 | 1899 | ||
| 1896 | void init_sched_rt_class(void) | 1900 | void init_sched_rt_class(void) |
| @@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 1985 | if (soft != RLIM_INFINITY) { | 1989 | if (soft != RLIM_INFINITY) { |
| 1986 | unsigned long next; | 1990 | unsigned long next; |
| 1987 | 1991 | ||
| 1988 | p->rt.timeout++; | 1992 | if (p->rt.watchdog_stamp != jiffies) { |
| 1993 | p->rt.timeout++; | ||
| 1994 | p->rt.watchdog_stamp = jiffies; | ||
| 1995 | } | ||
| 1996 | |||
| 1989 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | 1997 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); |
| 1990 | if (p->rt.timeout > next) | 1998 | if (p->rt.timeout > next) |
| 1991 | p->cputime_expires.sched_exp = p->se.sum_exec_runtime; | 1999 | p->cputime_expires.sched_exp = p->se.sum_exec_runtime; |
| @@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 2010 | if (--p->rt.time_slice) | 2018 | if (--p->rt.time_slice) |
| 2011 | return; | 2019 | return; |
| 2012 | 2020 | ||
| 2013 | p->rt.time_slice = RR_TIMESLICE; | 2021 | p->rt.time_slice = sched_rr_timeslice; |
| 2014 | 2022 | ||
| 2015 | /* | 2023 | /* |
| 2016 | * Requeue to the end of queue if we (and all of our ancestors) are the | 2024 | * Requeue to the end of queue if we (and all of our ancestors) are the |
| @@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
| 2041 | * Time slice is 0 for SCHED_FIFO tasks | 2049 | * Time slice is 0 for SCHED_FIFO tasks |
| 2042 | */ | 2050 | */ |
| 2043 | if (task->policy == SCHED_RR) | 2051 | if (task->policy == SCHED_RR) |
| 2044 | return RR_TIMESLICE; | 2052 | return sched_rr_timeslice; |
| 2045 | else | 2053 | else |
| 2046 | return 0; | 2054 | return 0; |
| 2047 | } | 2055 | } |
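The watchdog() change above charges p->rt.timeout at most once per jiffy by remembering the last jiffy it was charged for, so a task that gets ticked more than once within the same jiffy is not pushed toward its RLIMIT_RTTIME limit twice. A stand-alone sketch of that guard (jiffies here is just a variable advanced by hand, not the kernel counter):

#include <stdio.h>

static unsigned long jiffies;

struct sched_rt_entity {
        unsigned long timeout;
        unsigned long watchdog_stamp;
};

static void watchdog_tick(struct sched_rt_entity *rt)
{
        if (rt->watchdog_stamp != jiffies) {
                rt->timeout++;
                rt->watchdog_stamp = jiffies;
        }
}

int main(void)
{
        struct sched_rt_entity rt = { 0, 0 };

        jiffies = 1;
        watchdog_tick(&rt);             /* charged */
        watchdog_tick(&rt);             /* same jiffy: not charged again */
        jiffies = 2;
        watchdog_tick(&rt);             /* new jiffy: charged */

        printf("timeout=%lu\n", rt.timeout);    /* prints 2, not 3 */
        return 0;
}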
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc886441436a..cc03cfdf469f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -1,5 +1,7 @@ | |||
| 1 | 1 | ||
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include <linux/sched/sysctl.h> | ||
| 4 | #include <linux/sched/rt.h> | ||
| 3 | #include <linux/mutex.h> | 5 | #include <linux/mutex.h> |
| 4 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
| 5 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e8872..e036eda1a9c9 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
| @@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 21 | if (mask_str == NULL) | 21 | if (mask_str == NULL) |
| 22 | return -ENOMEM; | 22 | return -ENOMEM; |
| 23 | 23 | ||
| 24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 24 | if (v == (void *)1) { |
| 25 | seq_printf(seq, "timestamp %lu\n", jiffies); | 25 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| 26 | for_each_online_cpu(cpu) { | 26 | seq_printf(seq, "timestamp %lu\n", jiffies); |
| 27 | struct rq *rq = cpu_rq(cpu); | 27 | } else { |
| 28 | struct rq *rq; | ||
| 28 | #ifdef CONFIG_SMP | 29 | #ifdef CONFIG_SMP |
| 29 | struct sched_domain *sd; | 30 | struct sched_domain *sd; |
| 30 | int dcount = 0; | 31 | int dcount = 0; |
| 31 | #endif | 32 | #endif |
| 33 | cpu = (unsigned long)(v - 2); | ||
| 34 | rq = cpu_rq(cpu); | ||
| 32 | 35 | ||
| 33 | /* runqueue-specific stats */ | 36 | /* runqueue-specific stats */ |
| 34 | seq_printf(seq, | 37 | seq_printf(seq, |
| @@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 77 | return 0; | 80 | return 0; |
| 78 | } | 81 | } |
| 79 | 82 | ||
| 80 | static int schedstat_open(struct inode *inode, struct file *file) | 83 | /* |
| 84 | * This iterator needs some explanation. | ||
| 85 | * It returns 1 for the header position, | ||
| 86 | * which means 2 corresponds to cpu 0. | ||
| 87 | * In a hotplugged system some cpus, including cpu 0, may be missing, so we have | ||
| 88 | * to use cpumask_* to iterate over the cpus. | ||
| 89 | */ | ||
| 90 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | ||
| 81 | { | 91 | { |
| 82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | 92 | unsigned long n = *offset; |
| 83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
| 84 | struct seq_file *m; | ||
| 85 | int res; | ||
| 86 | 93 | ||
| 87 | if (!buf) | 94 | if (n == 0) |
| 88 | return -ENOMEM; | 95 | return (void *) 1; |
| 89 | res = single_open(file, show_schedstat, NULL); | 96 | |
| 90 | if (!res) { | 97 | n--; |
| 91 | m = file->private_data; | 98 | |
| 92 | m->buf = buf; | 99 | if (n > 0) |
| 93 | m->size = size; | 100 | n = cpumask_next(n - 1, cpu_online_mask); |
| 94 | } else | 101 | else |
| 95 | kfree(buf); | 102 | n = cpumask_first(cpu_online_mask); |
| 96 | return res; | 103 | |
| 104 | *offset = n + 1; | ||
| 105 | |||
| 106 | if (n < nr_cpu_ids) | ||
| 107 | return (void *)(unsigned long)(n + 2); | ||
| 108 | return NULL; | ||
| 109 | } | ||
| 110 | |||
| 111 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | ||
| 112 | { | ||
| 113 | (*offset)++; | ||
| 114 | return schedstat_start(file, offset); | ||
| 115 | } | ||
| 116 | |||
| 117 | static void schedstat_stop(struct seq_file *file, void *data) | ||
| 118 | { | ||
| 119 | } | ||
| 120 | |||
| 121 | static const struct seq_operations schedstat_sops = { | ||
| 122 | .start = schedstat_start, | ||
| 123 | .next = schedstat_next, | ||
| 124 | .stop = schedstat_stop, | ||
| 125 | .show = show_schedstat, | ||
| 126 | }; | ||
| 127 | |||
| 128 | static int schedstat_open(struct inode *inode, struct file *file) | ||
| 129 | { | ||
| 130 | return seq_open(file, &schedstat_sops); | ||
| 97 | } | 131 | } |
| 98 | 132 | ||
| 133 | static int schedstat_release(struct inode *inode, struct file *file) | ||
| 134 | { | ||
| 135 | return seq_release(inode, file); | ||
| 136 | } | ||
| 137 | |||
| 99 | static const struct file_operations proc_schedstat_operations = { | 138 | static const struct file_operations proc_schedstat_operations = { |
| 100 | .open = schedstat_open, | 139 | .open = schedstat_open, |
| 101 | .read = seq_read, | 140 | .read = seq_read, |
| 102 | .llseek = seq_lseek, | 141 | .llseek = seq_lseek, |
| 103 | .release = single_release, | 142 | .release = schedstat_release, |
| 104 | }; | 143 | }; |
| 105 | 144 | ||
| 106 | static int __init proc_schedstat_init(void) | 145 | static int __init proc_schedstat_init(void) |
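The schedstat conversion mirrors the sched_debug one: the header and each online cpu become separate seq_file records, so schedstat_open() no longer has to preallocate a buffer sized for every cpu. The stream a reader sees is unchanged; the short user-space consumer below just splits it back into header and per-cpu/per-domain records (the /proc/schedstat path and line prefixes are assumptions based on the current output format, not something guaranteed by this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/schedstat", "r");
        char line[1024];

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "cpu", 3) || !strncmp(line, "domain", 6))
                        printf("record: %s", line);
                else
                        printf("header: %s", line);
        }
        fclose(f);
        return 0;
}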
diff --git a/kernel/signal.c b/kernel/signal.c index 372771e948c2..2676aac4103d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 680 | * No need to set need_resched since signal event passing | 680 | * No need to set need_resched since signal event passing |
| 681 | * goes through ->blocked | 681 | * goes through ->blocked |
| 682 | */ | 682 | */ |
| 683 | void signal_wake_up(struct task_struct *t, int resume) | 683 | void signal_wake_up_state(struct task_struct *t, unsigned int state) |
| 684 | { | 684 | { |
| 685 | unsigned int mask; | ||
| 686 | |||
| 687 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 685 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
| 688 | |||
| 689 | /* | 686 | /* |
| 690 | * For SIGKILL, we want to wake it up in the stopped/traced/killable | 687 | * TASK_WAKEKILL also means wake it up in the stopped/traced/killable |
| 691 | * case. We don't check t->state here because there is a race with it | 688 | * case. We don't check t->state here because there is a race with it |
| 692 | * executing another processor and just now entering stopped state. | 689 | * executing another processor and just now entering stopped state. |
| 693 | * By using wake_up_state, we ensure the process will wake up and | 690 | * By using wake_up_state, we ensure the process will wake up and |
| 694 | * handle its death signal. | 691 | * handle its death signal. |
| 695 | */ | 692 | */ |
| 696 | mask = TASK_INTERRUPTIBLE; | 693 | if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) |
| 697 | if (resume) | ||
| 698 | mask |= TASK_WAKEKILL; | ||
| 699 | if (!wake_up_state(t, mask)) | ||
| 700 | kick_process(t); | 694 | kick_process(t); |
| 701 | } | 695 | } |
| 702 | 696 | ||
| @@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t) | |||
| 844 | assert_spin_locked(&t->sighand->siglock); | 838 | assert_spin_locked(&t->sighand->siglock); |
| 845 | 839 | ||
| 846 | task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); | 840 | task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); |
| 847 | signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); | 841 | ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); |
| 848 | } | 842 | } |
| 849 | 843 | ||
| 850 | /* | 844 | /* |
| @@ -1163,11 +1157,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 1163 | static void print_fatal_signal(int signr) | 1157 | static void print_fatal_signal(int signr) |
| 1164 | { | 1158 | { |
| 1165 | struct pt_regs *regs = signal_pt_regs(); | 1159 | struct pt_regs *regs = signal_pt_regs(); |
| 1166 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 1160 | printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", |
| 1167 | current->comm, task_pid_nr(current), signr); | 1161 | current->comm, task_pid_nr(current), signr); |
| 1168 | 1162 | ||
| 1169 | #if defined(__i386__) && !defined(__arch_um__) | 1163 | #if defined(__i386__) && !defined(__arch_um__) |
| 1170 | printk("code at %08lx: ", regs->ip); | 1164 | printk(KERN_INFO "code at %08lx: ", regs->ip); |
| 1171 | { | 1165 | { |
| 1172 | int i; | 1166 | int i; |
| 1173 | for (i = 0; i < 16; i++) { | 1167 | for (i = 0; i < 16; i++) { |
| @@ -1175,11 +1169,11 @@ static void print_fatal_signal(int signr) | |||
| 1175 | 1169 | ||
| 1176 | if (get_user(insn, (unsigned char *)(regs->ip + i))) | 1170 | if (get_user(insn, (unsigned char *)(regs->ip + i))) |
| 1177 | break; | 1171 | break; |
| 1178 | printk("%02x ", insn); | 1172 | printk(KERN_CONT "%02x ", insn); |
| 1179 | } | 1173 | } |
| 1180 | } | 1174 | } |
| 1175 | printk(KERN_CONT "\n"); | ||
| 1181 | #endif | 1176 | #endif |
| 1182 | printk("\n"); | ||
| 1183 | preempt_disable(); | 1177 | preempt_disable(); |
| 1184 | show_regs(regs); | 1178 | show_regs(regs); |
| 1185 | preempt_enable(); | 1179 | preempt_enable(); |
| @@ -1638,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1638 | unsigned long flags; | 1632 | unsigned long flags; |
| 1639 | struct sighand_struct *psig; | 1633 | struct sighand_struct *psig; |
| 1640 | bool autoreap = false; | 1634 | bool autoreap = false; |
| 1635 | cputime_t utime, stime; | ||
| 1641 | 1636 | ||
| 1642 | BUG_ON(sig == -1); | 1637 | BUG_ON(sig == -1); |
| 1643 | 1638 | ||
| @@ -1675,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1675 | task_uid(tsk)); | 1670 | task_uid(tsk)); |
| 1676 | rcu_read_unlock(); | 1671 | rcu_read_unlock(); |
| 1677 | 1672 | ||
| 1678 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); | 1673 | task_cputime(tsk, &utime, &stime); |
| 1679 | info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); | 1674 | info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); |
| 1675 | info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); | ||
| 1680 | 1676 | ||
| 1681 | info.si_status = tsk->exit_code & 0x7f; | 1677 | info.si_status = tsk->exit_code & 0x7f; |
| 1682 | if (tsk->exit_code & 0x80) | 1678 | if (tsk->exit_code & 0x80) |
| @@ -1740,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
| 1740 | unsigned long flags; | 1736 | unsigned long flags; |
| 1741 | struct task_struct *parent; | 1737 | struct task_struct *parent; |
| 1742 | struct sighand_struct *sighand; | 1738 | struct sighand_struct *sighand; |
| 1739 | cputime_t utime, stime; | ||
| 1743 | 1740 | ||
| 1744 | if (for_ptracer) { | 1741 | if (for_ptracer) { |
| 1745 | parent = tsk->parent; | 1742 | parent = tsk->parent; |
| @@ -1758,8 +1755,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
| 1758 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); | 1755 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
| 1759 | rcu_read_unlock(); | 1756 | rcu_read_unlock(); |
| 1760 | 1757 | ||
| 1761 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1758 | task_cputime(tsk, &utime, &stime); |
| 1762 | info.si_stime = cputime_to_clock_t(tsk->stime); | 1759 | info.si_utime = cputime_to_clock_t(utime); |
| 1760 | info.si_stime = cputime_to_clock_t(stime); | ||
| 1763 | 1761 | ||
| 1764 | info.si_code = why; | 1762 | info.si_code = why; |
| 1765 | switch (why) { | 1763 | switch (why) { |
| @@ -1800,6 +1798,10 @@ static inline int may_ptrace_stop(void) | |||
| 1800 | * If SIGKILL was already sent before the caller unlocked | 1798 | * If SIGKILL was already sent before the caller unlocked |
| 1801 | * ->siglock we must see ->core_state != NULL. Otherwise it | 1799 | * ->siglock we must see ->core_state != NULL. Otherwise it |
| 1802 | * is safe to enter schedule(). | 1800 | * is safe to enter schedule(). |
| 1801 | * | ||
| 1802 | * This is almost outdated: a task with a pending SIGKILL can't | ||
| 1803 | * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported | ||
| 1804 | * after SIGKILL was already dequeued. | ||
| 1803 | */ | 1805 | */ |
| 1804 | if (unlikely(current->mm->core_state) && | 1806 | if (unlikely(current->mm->core_state) && |
| 1805 | unlikely(current->mm == current->parent->mm)) | 1807 | unlikely(current->mm == current->parent->mm)) |
| @@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
| 1925 | if (gstop_done) | 1927 | if (gstop_done) |
| 1926 | do_notify_parent_cldstop(current, false, why); | 1928 | do_notify_parent_cldstop(current, false, why); |
| 1927 | 1929 | ||
| 1930 | /* tasklist protects us from ptrace_freeze_traced() */ | ||
| 1928 | __set_current_state(TASK_RUNNING); | 1931 | __set_current_state(TASK_RUNNING); |
| 1929 | if (clear_code) | 1932 | if (clear_code) |
| 1930 | current->exit_code = 0; | 1933 | current->exit_code = 0; |
| @@ -2396,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | |||
| 2396 | tracehook_signal_handler(sig, info, ka, regs, stepping); | 2399 | tracehook_signal_handler(sig, info, ka, regs, stepping); |
| 2397 | } | 2400 | } |
| 2398 | 2401 | ||
| 2402 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | ||
| 2403 | { | ||
| 2404 | if (failed) | ||
| 2405 | force_sigsegv(ksig->sig, current); | ||
| 2406 | else | ||
| 2407 | signal_delivered(ksig->sig, &ksig->info, &ksig->ka, | ||
| 2408 | signal_pt_regs(), stepping); | ||
| 2409 | } | ||
| 2410 | |||
| 2399 | /* | 2411 | /* |
| 2400 | * It could be that complete_signal() picked us to notify about the | 2412 | * It could be that complete_signal() picked us to notify about the |
| 2401 | * group-wide signal. Other threads should be notified now to take | 2413 | * group-wide signal. Other threads should be notified now to take |
| @@ -2613,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, | |||
| 2613 | return 0; | 2625 | return 0; |
| 2614 | } | 2626 | } |
| 2615 | 2627 | ||
| 2616 | long do_sigpending(void __user *set, unsigned long sigsetsize) | 2628 | #ifdef CONFIG_COMPAT |
| 2629 | COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, | ||
| 2630 | compat_sigset_t __user *, oset, compat_size_t, sigsetsize) | ||
| 2617 | { | 2631 | { |
| 2618 | long error = -EINVAL; | 2632 | #ifdef __BIG_ENDIAN |
| 2619 | sigset_t pending; | 2633 | sigset_t old_set = current->blocked; |
| 2620 | 2634 | ||
| 2635 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
| 2636 | if (sigsetsize != sizeof(sigset_t)) | ||
| 2637 | return -EINVAL; | ||
| 2638 | |||
| 2639 | if (nset) { | ||
| 2640 | compat_sigset_t new32; | ||
| 2641 | sigset_t new_set; | ||
| 2642 | int error; | ||
| 2643 | if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) | ||
| 2644 | return -EFAULT; | ||
| 2645 | |||
| 2646 | sigset_from_compat(&new_set, &new32); | ||
| 2647 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
| 2648 | |||
| 2649 | error = sigprocmask(how, &new_set, NULL); | ||
| 2650 | if (error) | ||
| 2651 | return error; | ||
| 2652 | } | ||
| 2653 | if (oset) { | ||
| 2654 | compat_sigset_t old32; | ||
| 2655 | sigset_to_compat(&old32, &old_set); | ||
| 2656 | if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) | ||
| 2657 | return -EFAULT; | ||
| 2658 | } | ||
| 2659 | return 0; | ||
| 2660 | #else | ||
| 2661 | return sys_rt_sigprocmask(how, (sigset_t __user *)nset, | ||
| 2662 | (sigset_t __user *)oset, sigsetsize); | ||
| 2663 | #endif | ||
| 2664 | } | ||
| 2665 | #endif | ||
| 2666 | |||
| 2667 | static int do_sigpending(void *set, unsigned long sigsetsize) | ||
| 2668 | { | ||
| 2621 | if (sigsetsize > sizeof(sigset_t)) | 2669 | if (sigsetsize > sizeof(sigset_t)) |
| 2622 | goto out; | 2670 | return -EINVAL; |
| 2623 | 2671 | ||
| 2624 | spin_lock_irq(¤t->sighand->siglock); | 2672 | spin_lock_irq(¤t->sighand->siglock); |
| 2625 | sigorsets(&pending, ¤t->pending.signal, | 2673 | sigorsets(set, ¤t->pending.signal, |
| 2626 | ¤t->signal->shared_pending.signal); | 2674 | ¤t->signal->shared_pending.signal); |
| 2627 | spin_unlock_irq(¤t->sighand->siglock); | 2675 | spin_unlock_irq(¤t->sighand->siglock); |
| 2628 | 2676 | ||
| 2629 | /* Outside the lock because only this thread touches it. */ | 2677 | /* Outside the lock because only this thread touches it. */ |
| 2630 | sigandsets(&pending, ¤t->blocked, &pending); | 2678 | sigandsets(set, ¤t->blocked, set); |
| 2631 | 2679 | return 0; | |
| 2632 | error = -EFAULT; | ||
| 2633 | if (!copy_to_user(set, &pending, sigsetsize)) | ||
| 2634 | error = 0; | ||
| 2635 | |||
| 2636 | out: | ||
| 2637 | return error; | ||
| 2638 | } | 2680 | } |
| 2639 | 2681 | ||
| 2640 | /** | 2682 | /** |
| @@ -2643,11 +2685,36 @@ out: | |||
| 2643 | * @set: stores pending signals | 2685 | * @set: stores pending signals |
| 2644 | * @sigsetsize: size of sigset_t type or larger | 2686 | * @sigsetsize: size of sigset_t type or larger |
| 2645 | */ | 2687 | */ |
| 2646 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) | 2688 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) |
| 2647 | { | 2689 | { |
| 2648 | return do_sigpending(set, sigsetsize); | 2690 | sigset_t set; |
| 2691 | int err = do_sigpending(&set, sigsetsize); | ||
| 2692 | if (!err && copy_to_user(uset, &set, sigsetsize)) | ||
| 2693 | err = -EFAULT; | ||
| 2694 | return err; | ||
| 2649 | } | 2695 | } |
| 2650 | 2696 | ||
| 2697 | #ifdef CONFIG_COMPAT | ||
| 2698 | COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, | ||
| 2699 | compat_size_t, sigsetsize) | ||
| 2700 | { | ||
| 2701 | #ifdef __BIG_ENDIAN | ||
| 2702 | sigset_t set; | ||
| 2703 | int err = do_sigpending(&set, sigsetsize); | ||
| 2704 | if (!err) { | ||
| 2705 | compat_sigset_t set32; | ||
| 2706 | sigset_to_compat(&set32, &set); | ||
| 2707 | /* we can get here only if sigsetsize <= sizeof(set) */ | ||
| 2708 | if (copy_to_user(uset, &set32, sigsetsize)) | ||
| 2709 | err = -EFAULT; | ||
| 2710 | } | ||
| 2711 | return err; | ||
| 2712 | #else | ||
| 2713 | return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); | ||
| 2714 | #endif | ||
| 2715 | } | ||
| 2716 | #endif | ||
| 2717 | |||
| 2651 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER | 2718 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER |
| 2652 | 2719 | ||
| 2653 | int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | 2720 | int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) |
| @@ -2924,6 +2991,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | |||
| 2924 | return do_tkill(0, pid, sig); | 2991 | return do_tkill(0, pid, sig); |
| 2925 | } | 2992 | } |
| 2926 | 2993 | ||
| 2994 | static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) | ||
| 2995 | { | ||
| 2996 | /* Not even root can pretend to send signals from the kernel. | ||
| 2997 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | ||
| 2998 | */ | ||
| 2999 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && | ||
| 3000 | (task_pid_vnr(current) != pid)) { | ||
| 3001 | /* We used to allow any < 0 si_code */ | ||
| 3002 | WARN_ON_ONCE(info->si_code < 0); | ||
| 3003 | return -EPERM; | ||
| 3004 | } | ||
| 3005 | info->si_signo = sig; | ||
| 3006 | |||
| 3007 | /* POSIX.1b doesn't mention process groups. */ | ||
| 3008 | return kill_proc_info(sig, info, pid); | ||
| 3009 | } | ||
| 3010 | |||
| 2927 | /** | 3011 | /** |
| 2928 | * sys_rt_sigqueueinfo - send signal information to a signal | 3012 | * sys_rt_sigqueueinfo - send signal information to a signal |
| 2929 | * @pid: the PID of the thread | 3013 | * @pid: the PID of the thread |
| @@ -2934,25 +3018,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | |||
| 2934 | siginfo_t __user *, uinfo) | 3018 | siginfo_t __user *, uinfo) |
| 2935 | { | 3019 | { |
| 2936 | siginfo_t info; | 3020 | siginfo_t info; |
| 2937 | |||
| 2938 | if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) | 3021 | if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) |
| 2939 | return -EFAULT; | 3022 | return -EFAULT; |
| 3023 | return do_rt_sigqueueinfo(pid, sig, &info); | ||
| 3024 | } | ||
| 2940 | 3025 | ||
| 2941 | /* Not even root can pretend to send signals from the kernel. | 3026 | #ifdef CONFIG_COMPAT |
| 2942 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 3027 | COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, |
| 2943 | */ | 3028 | compat_pid_t, pid, |
| 2944 | if (info.si_code >= 0 || info.si_code == SI_TKILL) { | 3029 | int, sig, |
| 2945 | /* We used to allow any < 0 si_code */ | 3030 | struct compat_siginfo __user *, uinfo) |
| 2946 | WARN_ON_ONCE(info.si_code < 0); | 3031 | { |
| 2947 | return -EPERM; | 3032 | siginfo_t info; |
| 2948 | } | 3033 | int ret = copy_siginfo_from_user32(&info, uinfo); |
| 2949 | info.si_signo = sig; | 3034 | if (unlikely(ret)) |
| 2950 | 3035 | return ret; | |
| 2951 | /* POSIX.1b doesn't mention process groups. */ | 3036 | return do_rt_sigqueueinfo(pid, sig, &info); |
| 2952 | return kill_proc_info(sig, &info, pid); | ||
| 2953 | } | 3037 | } |
| 3038 | #endif | ||
| 2954 | 3039 | ||
| 2955 | long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | 3040 | static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) |
| 2956 | { | 3041 | { |
| 2957 | /* This is only valid for single tasks */ | 3042 | /* This is only valid for single tasks */ |
| 2958 | if (pid <= 0 || tgid <= 0) | 3043 | if (pid <= 0 || tgid <= 0) |
| @@ -2961,7 +3046,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | |||
| 2961 | /* Not even root can pretend to send signals from the kernel. | 3046 | /* Not even root can pretend to send signals from the kernel. |
| 2962 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 3047 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
| 2963 | */ | 3048 | */ |
| 2964 | if (info->si_code >= 0 || info->si_code == SI_TKILL) { | 3049 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && |
| 3050 | (task_pid_vnr(current) != pid)) { | ||
| 2965 | /* We used to allow any < 0 si_code */ | 3051 | /* We used to allow any < 0 si_code */ |
| 2966 | WARN_ON_ONCE(info->si_code < 0); | 3052 | WARN_ON_ONCE(info->si_code < 0); |
| 2967 | return -EPERM; | 3053 | return -EPERM; |
| @@ -2982,6 +3068,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, | |||
| 2982 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | 3068 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); |
| 2983 | } | 3069 | } |
| 2984 | 3070 | ||
| 3071 | #ifdef CONFIG_COMPAT | ||
| 3072 | COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | ||
| 3073 | compat_pid_t, tgid, | ||
| 3074 | compat_pid_t, pid, | ||
| 3075 | int, sig, | ||
| 3076 | struct compat_siginfo __user *, uinfo) | ||
| 3077 | { | ||
| 3078 | siginfo_t info; | ||
| 3079 | |||
| 3080 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
| 3081 | return -EFAULT; | ||
| 3082 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | ||
| 3083 | } | ||
| 3084 | #endif | ||
| 3085 | |||
| 2985 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3086 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
| 2986 | { | 3087 | { |
| 2987 | struct task_struct *t = current; | 3088 | struct task_struct *t = current; |
| @@ -3027,7 +3128,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3027 | return 0; | 3128 | return 0; |
| 3028 | } | 3129 | } |
| 3029 | 3130 | ||
| 3030 | int | 3131 | static int |
| 3031 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3132 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) |
| 3032 | { | 3133 | { |
| 3033 | stack_t oss; | 3134 | stack_t oss; |
| @@ -3092,12 +3193,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
| 3092 | out: | 3193 | out: |
| 3093 | return error; | 3194 | return error; |
| 3094 | } | 3195 | } |
| 3095 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
| 3096 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | 3196 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) |
| 3097 | { | 3197 | { |
| 3098 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | 3198 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); |
| 3099 | } | 3199 | } |
| 3100 | #endif | ||
| 3101 | 3200 | ||
| 3102 | int restore_altstack(const stack_t __user *uss) | 3201 | int restore_altstack(const stack_t __user *uss) |
| 3103 | { | 3202 | { |
| @@ -3115,9 +3214,9 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) | |||
| 3115 | } | 3214 | } |
| 3116 | 3215 | ||
| 3117 | #ifdef CONFIG_COMPAT | 3216 | #ifdef CONFIG_COMPAT |
| 3118 | #ifdef CONFIG_GENERIC_SIGALTSTACK | 3217 | COMPAT_SYSCALL_DEFINE2(sigaltstack, |
| 3119 | asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, | 3218 | const compat_stack_t __user *, uss_ptr, |
| 3120 | compat_stack_t __user *uoss_ptr) | 3219 | compat_stack_t __user *, uoss_ptr) |
| 3121 | { | 3220 | { |
| 3122 | stack_t uss, uoss; | 3221 | stack_t uss, uoss; |
| 3123 | int ret; | 3222 | int ret; |
| @@ -3164,7 +3263,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
| 3164 | __put_user(t->sas_ss_size, &uss->ss_size); | 3263 | __put_user(t->sas_ss_size, &uss->ss_size); |
| 3165 | } | 3264 | } |
| 3166 | #endif | 3265 | #endif |
| 3167 | #endif | ||
| 3168 | 3266 | ||
| 3169 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 3267 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
| 3170 | 3268 | ||
| @@ -3174,7 +3272,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
| 3174 | */ | 3272 | */ |
| 3175 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 3273 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) |
| 3176 | { | 3274 | { |
| 3177 | return do_sigpending(set, sizeof(*set)); | 3275 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); |
| 3178 | } | 3276 | } |
| 3179 | 3277 | ||
| 3180 | #endif | 3278 | #endif |
| @@ -3230,7 +3328,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
| 3230 | } | 3328 | } |
| 3231 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 3329 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
| 3232 | 3330 | ||
| 3233 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION | 3331 | #ifndef CONFIG_ODD_RT_SIGACTION |
| 3234 | /** | 3332 | /** |
| 3235 | * sys_rt_sigaction - alter an action taken by a process | 3333 | * sys_rt_sigaction - alter an action taken by a process |
| 3236 | * @sig: signal to be sent | 3334 | * @sig: signal to be sent |
| @@ -3264,7 +3362,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
| 3264 | out: | 3362 | out: |
| 3265 | return ret; | 3363 | return ret; |
| 3266 | } | 3364 | } |
| 3267 | #endif /* __ARCH_WANT_SYS_RT_SIGACTION */ | 3365 | #ifdef CONFIG_COMPAT |
| 3366 | COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | ||
| 3367 | const struct compat_sigaction __user *, act, | ||
| 3368 | struct compat_sigaction __user *, oact, | ||
| 3369 | compat_size_t, sigsetsize) | ||
| 3370 | { | ||
| 3371 | struct k_sigaction new_ka, old_ka; | ||
| 3372 | compat_sigset_t mask; | ||
| 3373 | #ifdef __ARCH_HAS_SA_RESTORER | ||
| 3374 | compat_uptr_t restorer; | ||
| 3375 | #endif | ||
| 3376 | int ret; | ||
| 3377 | |||
| 3378 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
| 3379 | if (sigsetsize != sizeof(compat_sigset_t)) | ||
| 3380 | return -EINVAL; | ||
| 3381 | |||
| 3382 | if (act) { | ||
| 3383 | compat_uptr_t handler; | ||
| 3384 | ret = get_user(handler, &act->sa_handler); | ||
| 3385 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
| 3386 | #ifdef __ARCH_HAS_SA_RESTORER | ||
| 3387 | ret |= get_user(restorer, &act->sa_restorer); | ||
| 3388 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
| 3389 | #endif | ||
| 3390 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); | ||
| 3391 | ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); | ||
| 3392 | if (ret) | ||
| 3393 | return -EFAULT; | ||
| 3394 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); | ||
| 3395 | } | ||
| 3396 | |||
| 3397 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
| 3398 | if (!ret && oact) { | ||
| 3399 | sigset_to_compat(&mask, &old_ka.sa.sa_mask); | ||
| 3400 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), | ||
| 3401 | &oact->sa_handler); | ||
| 3402 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); | ||
| 3403 | ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); | ||
| 3404 | #ifdef __ARCH_HAS_SA_RESTORER | ||
| 3405 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), | ||
| 3406 | &oact->sa_restorer); | ||
| 3407 | #endif | ||
| 3408 | } | ||
| 3409 | return ret; | ||
| 3410 | } | ||
| 3411 | #endif | ||
| 3412 | #endif /* !CONFIG_ODD_RT_SIGACTION */ | ||
| 3413 | |||
| 3414 | #ifdef CONFIG_OLD_SIGACTION | ||
| 3415 | SYSCALL_DEFINE3(sigaction, int, sig, | ||
| 3416 | const struct old_sigaction __user *, act, | ||
| 3417 | struct old_sigaction __user *, oact) | ||
| 3418 | { | ||
| 3419 | struct k_sigaction new_ka, old_ka; | ||
| 3420 | int ret; | ||
| 3421 | |||
| 3422 | if (act) { | ||
| 3423 | old_sigset_t mask; | ||
| 3424 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
| 3425 | __get_user(new_ka.sa.sa_handler, &act->sa_handler) || | ||
| 3426 | __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || | ||
| 3427 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
| 3428 | __get_user(mask, &act->sa_mask)) | ||
| 3429 | return -EFAULT; | ||
| 3430 | #ifdef __ARCH_HAS_KA_RESTORER | ||
| 3431 | new_ka.ka_restorer = NULL; | ||
| 3432 | #endif | ||
| 3433 | siginitset(&new_ka.sa.sa_mask, mask); | ||
| 3434 | } | ||
| 3435 | |||
| 3436 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
| 3437 | |||
| 3438 | if (!ret && oact) { | ||
| 3439 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
| 3440 | __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || | ||
| 3441 | __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || | ||
| 3442 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
| 3443 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
| 3444 | return -EFAULT; | ||
| 3445 | } | ||
| 3446 | |||
| 3447 | return ret; | ||
| 3448 | } | ||
| 3449 | #endif | ||
| 3450 | #ifdef CONFIG_COMPAT_OLD_SIGACTION | ||
| 3451 | COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, | ||
| 3452 | const struct compat_old_sigaction __user *, act, | ||
| 3453 | struct compat_old_sigaction __user *, oact) | ||
| 3454 | { | ||
| 3455 | struct k_sigaction new_ka, old_ka; | ||
| 3456 | int ret; | ||
| 3457 | compat_old_sigset_t mask; | ||
| 3458 | compat_uptr_t handler, restorer; | ||
| 3459 | |||
| 3460 | if (act) { | ||
| 3461 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
| 3462 | __get_user(handler, &act->sa_handler) || | ||
| 3463 | __get_user(restorer, &act->sa_restorer) || | ||
| 3464 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
| 3465 | __get_user(mask, &act->sa_mask)) | ||
| 3466 | return -EFAULT; | ||
| 3467 | |||
| 3468 | #ifdef __ARCH_HAS_KA_RESTORER | ||
| 3469 | new_ka.ka_restorer = NULL; | ||
| 3470 | #endif | ||
| 3471 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
| 3472 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
| 3473 | siginitset(&new_ka.sa.sa_mask, mask); | ||
| 3474 | } | ||
| 3475 | |||
| 3476 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
| 3477 | |||
| 3478 | if (!ret && oact) { | ||
| 3479 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
| 3480 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), | ||
| 3481 | &oact->sa_handler) || | ||
| 3482 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), | ||
| 3483 | &oact->sa_restorer) || | ||
| 3484 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
| 3485 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
| 3486 | return -EFAULT; | ||
| 3487 | } | ||
| 3488 | return ret; | ||
| 3489 | } | ||
| 3490 | #endif | ||
| 3268 | 3491 | ||
| 3269 | #ifdef __ARCH_WANT_SYS_SGETMASK | 3492 | #ifdef __ARCH_WANT_SYS_SGETMASK |
| 3270 | 3493 | ||
| @@ -3332,7 +3555,6 @@ int sigsuspend(sigset_t *set) | |||
| 3332 | return -ERESTARTNOHAND; | 3555 | return -ERESTARTNOHAND; |
| 3333 | } | 3556 | } |
| 3334 | 3557 | ||
| 3335 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | ||
| 3336 | /** | 3558 | /** |
| 3337 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3559 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
| 3338 | * @unewset value until a signal is received | 3560 | * @unewset value until a signal is received |
| @@ -3351,7 +3573,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
| 3351 | return -EFAULT; | 3573 | return -EFAULT; |
| 3352 | return sigsuspend(&newset); | 3574 | return sigsuspend(&newset); |
| 3353 | } | 3575 | } |
| 3354 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3576 | |
| 3577 | #ifdef CONFIG_COMPAT | ||
| 3578 | COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) | ||
| 3579 | { | ||
| 3580 | #ifdef __BIG_ENDIAN | ||
| 3581 | sigset_t newset; | ||
| 3582 | compat_sigset_t newset32; | ||
| 3583 | |||
| 3584 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
| 3585 | if (sigsetsize != sizeof(sigset_t)) | ||
| 3586 | return -EINVAL; | ||
| 3587 | |||
| 3588 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | ||
| 3589 | return -EFAULT; | ||
| 3590 | sigset_from_compat(&newset, &newset32); | ||
| 3591 | return sigsuspend(&newset); | ||
| 3592 | #else | ||
| 3593 | /* on little-endian bitmaps don't care about granularity */ | ||
| 3594 | return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); | ||
| 3595 | #endif | ||
| 3596 | } | ||
| 3597 | #endif | ||
| 3598 | |||
| 3599 | #ifdef CONFIG_OLD_SIGSUSPEND | ||
| 3600 | SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) | ||
| 3601 | { | ||
| 3602 | sigset_t blocked; | ||
| 3603 | siginitset(&blocked, mask); | ||
| 3604 | return sigsuspend(&blocked); | ||
| 3605 | } | ||
| 3606 | #endif | ||
| 3607 | #ifdef CONFIG_OLD_SIGSUSPEND3 | ||
| 3608 | SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) | ||
| 3609 | { | ||
| 3610 | sigset_t blocked; | ||
| 3611 | siginitset(&blocked, mask); | ||
| 3612 | return sigsuspend(&blocked); | ||
| 3613 | } | ||
| 3614 | #endif | ||
| 3355 | 3615 | ||
| 3356 | __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) | 3616 | __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) |
| 3357 | { | 3617 | { |
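
Note on the compat sigsuspend/sigaction additions above: the compat rt_sigsuspend() only needs real conversion work on big-endian systems, because a native sigset_t (64-bit words) and a compat_sigset_t (32-bit words, low word first) happen to share a byte layout on little-endian machines — which is why the #else branch simply casts the user pointer through to sys_rt_sigsuspend(). A small stand-alone illustration of that layout argument (user-space C, not kernel code; the mask value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint64_t native = 0x0000000100000002ULL;  /* one 64-bit signal word          */
        uint32_t compat[2];                       /* same mask as two 32-bit words,  */
                                                  /* low word first (compat layout)  */
        compat[0] = (uint32_t)native;
        compat[1] = (uint32_t)(native >> 32);

        /* Little-endian: both representations are byte-for-byte identical, so no
         * copy/convert step is needed.  Big-endian: the word halves land in the
         * wrong places, which is what sigset_from_compat() fixes in the hunk above. */
        puts(memcmp(&native, compat, sizeof(native)) == 0
                 ? "layouts match: pass-through is safe"
                 : "layouts differ: explicit conversion needed");
        return 0;
    }
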
diff --git a/kernel/smp.c b/kernel/smp.c index 29dd40a9f2f4..8e451f3ff51b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -16,23 +16,14 @@ | |||
| 16 | #include "smpboot.h" | 16 | #include "smpboot.h" |
| 17 | 17 | ||
| 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
| 19 | static struct { | ||
| 20 | struct list_head queue; | ||
| 21 | raw_spinlock_t lock; | ||
| 22 | } call_function __cacheline_aligned_in_smp = | ||
| 23 | { | ||
| 24 | .queue = LIST_HEAD_INIT(call_function.queue), | ||
| 25 | .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), | ||
| 26 | }; | ||
| 27 | |||
| 28 | enum { | 19 | enum { |
| 29 | CSD_FLAG_LOCK = 0x01, | 20 | CSD_FLAG_LOCK = 0x01, |
| 30 | }; | 21 | }; |
| 31 | 22 | ||
| 32 | struct call_function_data { | 23 | struct call_function_data { |
| 33 | struct call_single_data csd; | 24 | struct call_single_data __percpu *csd; |
| 34 | atomic_t refs; | ||
| 35 | cpumask_var_t cpumask; | 25 | cpumask_var_t cpumask; |
| 26 | cpumask_var_t cpumask_ipi; | ||
| 36 | }; | 27 | }; |
| 37 | 28 | ||
| 38 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | 29 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); |
| @@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 56 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 47 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
| 57 | cpu_to_node(cpu))) | 48 | cpu_to_node(cpu))) |
| 58 | return notifier_from_errno(-ENOMEM); | 49 | return notifier_from_errno(-ENOMEM); |
| 50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | ||
| 51 | cpu_to_node(cpu))) | ||
| 52 | return notifier_from_errno(-ENOMEM); | ||
| 53 | cfd->csd = alloc_percpu(struct call_single_data); | ||
| 54 | if (!cfd->csd) { | ||
| 55 | free_cpumask_var(cfd->cpumask); | ||
| 56 | return notifier_from_errno(-ENOMEM); | ||
| 57 | } | ||
| 59 | break; | 58 | break; |
| 60 | 59 | ||
| 61 | #ifdef CONFIG_HOTPLUG_CPU | 60 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 65 | case CPU_DEAD: | 64 | case CPU_DEAD: |
| 66 | case CPU_DEAD_FROZEN: | 65 | case CPU_DEAD_FROZEN: |
| 67 | free_cpumask_var(cfd->cpumask); | 66 | free_cpumask_var(cfd->cpumask); |
| 67 | free_cpumask_var(cfd->cpumask_ipi); | ||
| 68 | free_percpu(cfd->csd); | ||
| 68 | break; | 69 | break; |
| 69 | #endif | 70 | #endif |
| 70 | }; | 71 | }; |
| @@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) | |||
| 166 | } | 167 | } |
| 167 | 168 | ||
| 168 | /* | 169 | /* |
| 169 | * Invoked by arch to handle an IPI for call function. Must be called with | ||
| 170 | * interrupts disabled. | ||
| 171 | */ | ||
| 172 | void generic_smp_call_function_interrupt(void) | ||
| 173 | { | ||
| 174 | struct call_function_data *data; | ||
| 175 | int cpu = smp_processor_id(); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Shouldn't receive this interrupt on a cpu that is not yet online. | ||
| 179 | */ | ||
| 180 | WARN_ON_ONCE(!cpu_online(cpu)); | ||
| 181 | |||
| 182 | /* | ||
| 183 | * Ensure entry is visible on call_function_queue after we have | ||
| 184 | * entered the IPI. See comment in smp_call_function_many. | ||
| 185 | * If we don't have this, then we may miss an entry on the list | ||
| 186 | * and never get another IPI to process it. | ||
| 187 | */ | ||
| 188 | smp_mb(); | ||
| 189 | |||
| 190 | /* | ||
| 191 | * It's ok to use list_for_each_rcu() here even though we may | ||
| 192 | * delete 'pos', since list_del_rcu() doesn't clear ->next | ||
| 193 | */ | ||
| 194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | ||
| 195 | int refs; | ||
| 196 | smp_call_func_t func; | ||
| 197 | |||
| 198 | /* | ||
| 199 | * Since we walk the list without any locks, we might | ||
| 200 | * see an entry that was completed, removed from the | ||
| 201 | * list and is in the process of being reused. | ||
| 202 | * | ||
| 203 | * We must check that the cpu is in the cpumask before | ||
| 204 | * checking the refs, and both must be set before | ||
| 205 | * executing the callback on this cpu. | ||
| 206 | */ | ||
| 207 | |||
| 208 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
| 209 | continue; | ||
| 210 | |||
| 211 | smp_rmb(); | ||
| 212 | |||
| 213 | if (atomic_read(&data->refs) == 0) | ||
| 214 | continue; | ||
| 215 | |||
| 216 | func = data->csd.func; /* save for later warn */ | ||
| 217 | func(data->csd.info); | ||
| 218 | |||
| 219 | /* | ||
| 220 | * If the cpu mask is not still set then func enabled | ||
| 221 | * interrupts (BUG), and this cpu took another smp call | ||
| 222 | * function interrupt and executed func(info) twice | ||
| 223 | * on this cpu. That nested execution decremented refs. | ||
| 224 | */ | ||
| 225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
| 226 | WARN(1, "%pf enabled interrupts and double executed\n", func); | ||
| 227 | continue; | ||
| 228 | } | ||
| 229 | |||
| 230 | refs = atomic_dec_return(&data->refs); | ||
| 231 | WARN_ON(refs < 0); | ||
| 232 | |||
| 233 | if (refs) | ||
| 234 | continue; | ||
| 235 | |||
| 236 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
| 237 | |||
| 238 | raw_spin_lock(&call_function.lock); | ||
| 239 | list_del_rcu(&data->csd.list); | ||
| 240 | raw_spin_unlock(&call_function.lock); | ||
| 241 | |||
| 242 | csd_unlock(&data->csd); | ||
| 243 | } | ||
| 244 | |||
| 245 | } | ||
| 246 | |||
| 247 | /* | ||
| 248 | * Invoked by arch to handle an IPI for call function single. Must be | 170 | * Invoked by arch to handle an IPI for call function single. Must be |
| 249 | * called from the arch with interrupts disabled. | 171 | * called from the arch with interrupts disabled. |
| 250 | */ | 172 | */ |
| @@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 448 | smp_call_func_t func, void *info, bool wait) | 370 | smp_call_func_t func, void *info, bool wait) |
| 449 | { | 371 | { |
| 450 | struct call_function_data *data; | 372 | struct call_function_data *data; |
| 451 | unsigned long flags; | 373 | int cpu, next_cpu, this_cpu = smp_processor_id(); |
| 452 | int refs, cpu, next_cpu, this_cpu = smp_processor_id(); | ||
| 453 | 374 | ||
| 454 | /* | 375 | /* |
| 455 | * Can deadlock when called with interrupts disabled. | 376 | * Can deadlock when called with interrupts disabled. |
| @@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 481 | } | 402 | } |
| 482 | 403 | ||
| 483 | data = &__get_cpu_var(cfd_data); | 404 | data = &__get_cpu_var(cfd_data); |
| 484 | csd_lock(&data->csd); | ||
| 485 | |||
| 486 | /* This BUG_ON verifies our reuse assertions and can be removed */ | ||
| 487 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
| 488 | 405 | ||
| 489 | /* | ||
| 490 | * The global call function queue list add and delete are protected | ||
| 491 | * by a lock, but the list is traversed without any lock, relying | ||
| 492 | * on the rcu list add and delete to allow safe concurrent traversal. | ||
| 493 | * We reuse the call function data without waiting for any grace | ||
| 494 | * period after some other cpu removes it from the global queue. | ||
| 495 | * This means a cpu might find our data block as it is being | ||
| 496 | * filled out. | ||
| 497 | * | ||
| 498 | * We hold off the interrupt handler on the other cpu by | ||
| 499 | * ordering our writes to the cpu mask vs our setting of the | ||
| 500 | * refs counter. We assert only the cpu owning the data block | ||
| 501 | * will set a bit in cpumask, and each bit will only be cleared | ||
| 502 | * by the subject cpu. Each cpu must first find its bit is | ||
| 503 | * set and then check that refs is set indicating the element is | ||
| 504 | * ready to be processed, otherwise it must skip the entry. | ||
| 505 | * | ||
| 506 | * On the previous iteration refs was set to 0 by another cpu. | ||
| 507 | * To avoid the use of transitivity, set the counter to 0 here | ||
| 508 | * so the wmb will pair with the rmb in the interrupt handler. | ||
| 509 | */ | ||
| 510 | atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ | ||
| 511 | |||
| 512 | data->csd.func = func; | ||
| 513 | data->csd.info = info; | ||
| 514 | |||
| 515 | /* Ensure 0 refs is visible before mask. Also orders func and info */ | ||
| 516 | smp_wmb(); | ||
| 517 | |||
| 518 | /* We rely on the "and" being processed before the store */ | ||
| 519 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 406 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
| 520 | cpumask_clear_cpu(this_cpu, data->cpumask); | 407 | cpumask_clear_cpu(this_cpu, data->cpumask); |
| 521 | refs = cpumask_weight(data->cpumask); | ||
| 522 | 408 | ||
| 523 | /* Some callers race with other cpus changing the passed mask */ | 409 | /* Some callers race with other cpus changing the passed mask */ |
| 524 | if (unlikely(!refs)) { | 410 | if (unlikely(!cpumask_weight(data->cpumask))) |
| 525 | csd_unlock(&data->csd); | ||
| 526 | return; | 411 | return; |
| 527 | } | ||
| 528 | 412 | ||
| 529 | raw_spin_lock_irqsave(&call_function.lock, flags); | ||
| 530 | /* | 413 | /* |
| 531 | * Place entry at the _HEAD_ of the list, so that any cpu still | 414 | * After we put an entry into the list, data->cpumask |
| 532 | * observing the entry in generic_smp_call_function_interrupt() | 415 | * may be cleared again when another CPU sends another IPI for |
| 533 | * will not miss any other list entries: | 416 | * a SMP function call, so data->cpumask will be zero. |
| 534 | */ | 417 | */ |
| 535 | list_add_rcu(&data->csd.list, &call_function.queue); | 418 | cpumask_copy(data->cpumask_ipi, data->cpumask); |
| 536 | /* | ||
| 537 | * We rely on the wmb() in list_add_rcu to complete our writes | ||
| 538 | * to the cpumask before this write to refs, which indicates | ||
| 539 | * data is on the list and is ready to be processed. | ||
| 540 | */ | ||
| 541 | atomic_set(&data->refs, refs); | ||
| 542 | raw_spin_unlock_irqrestore(&call_function.lock, flags); | ||
| 543 | 419 | ||
| 544 | /* | 420 | for_each_cpu(cpu, data->cpumask) { |
| 545 | * Make the list addition visible before sending the ipi. | 421 | struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); |
| 546 | * (IPIs must obey or appear to obey normal Linux cache | 422 | struct call_single_queue *dst = |
| 547 | * coherency rules -- see comment in generic_exec_single). | 423 | &per_cpu(call_single_queue, cpu); |
| 548 | */ | 424 | unsigned long flags; |
| 549 | smp_mb(); | 425 | |
| 426 | csd_lock(csd); | ||
| 427 | csd->func = func; | ||
| 428 | csd->info = info; | ||
| 429 | |||
| 430 | raw_spin_lock_irqsave(&dst->lock, flags); | ||
| 431 | list_add_tail(&csd->list, &dst->list); | ||
| 432 | raw_spin_unlock_irqrestore(&dst->lock, flags); | ||
| 433 | } | ||
| 550 | 434 | ||
| 551 | /* Send a message to all CPUs in the map */ | 435 | /* Send a message to all CPUs in the map */ |
| 552 | arch_send_call_function_ipi_mask(data->cpumask); | 436 | arch_send_call_function_ipi_mask(data->cpumask_ipi); |
| 553 | 437 | ||
| 554 | /* Optionally wait for the CPUs to complete */ | 438 | if (wait) { |
| 555 | if (wait) | 439 | for_each_cpu(cpu, data->cpumask) { |
| 556 | csd_lock_wait(&data->csd); | 440 | struct call_single_data *csd = |
| 441 | per_cpu_ptr(data->csd, cpu); | ||
| 442 | csd_lock_wait(csd); | ||
| 443 | } | ||
| 444 | } | ||
| 557 | } | 445 | } |
| 558 | EXPORT_SYMBOL(smp_call_function_many); | 446 | EXPORT_SYMBOL(smp_call_function_many); |
| 559 | 447 | ||
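
Note on the smp.c rework above: smp_call_function_many() drops the single global call_function queue (and its refcount dance) in favour of one pre-allocated call_single_data per destination CPU, appended to that CPU's own call_single_queue; the separate cpumask_ipi snapshot guards against data->cpumask being cleared again before the IPIs are actually sent (see the comment added in the hunk). A toy, single-threaded model of the new queueing shape (illustrative names; no locking or real IPIs, both of which the kernel code obviously needs):

    #include <stdio.h>

    #define NCPUS 4

    struct csd {                       /* stand-in for struct call_single_data */
        void (*func)(void *info);
        void *info;
        struct csd *next;
    };

    struct cpu_queue {                 /* stand-in for the per-cpu call_single_queue */
        struct csd *head;
    };

    static struct cpu_queue queue[NCPUS];
    static struct csd slot[NCPUS];     /* like the alloc_percpu()'d csd array:
                                          one pre-allocated entry per destination */

    static void queue_call(int cpu, void (*func)(void *), void *info)
    {
        struct csd *c = &slot[cpu];    /* sender owns the slot until the callback ran */
        c->func = func;
        c->info = info;
        c->next = queue[cpu].head;     /* the real code appends under dst->lock */
        queue[cpu].head = c;
    }

    static void ipi_handler(int cpu)   /* what the destination does on interrupt */
    {
        struct csd *c = queue[cpu].head;

        queue[cpu].head = NULL;
        for (; c; c = c->next)
            c->func(c->info);
    }

    static void say(void *info)
    {
        printf("callback ran with \"%s\"\n", (const char *)info);
    }

    int main(void)
    {
        for (int cpu = 1; cpu < NCPUS; cpu++)   /* "smp_call_function_many" from cpu 0 */
            queue_call(cpu, say, "ping");
        for (int cpu = 1; cpu < NCPUS; cpu++)   /* each destination drains only its list */
            ipi_handler(cpu);
        return 0;
    }

Each destination now walks only the work addressed to it, instead of every CPU scanning one shared RCU list and filtering by cpumask and refcount.
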
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d6c5fc054242..b9bde5727829 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data) | |||
| 131 | continue; | 131 | continue; |
| 132 | } | 132 | } |
| 133 | 133 | ||
| 134 | BUG_ON(td->cpu != smp_processor_id()); | 134 | //BUG_ON(td->cpu != smp_processor_id()); |
| 135 | 135 | ||
| 136 | /* Check for state change setup */ | 136 | /* Check for state change setup */ |
| 137 | switch (td->status) { | 137 | switch (td->status) { |
| @@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
| 183 | kfree(td); | 183 | kfree(td); |
| 184 | return PTR_ERR(tsk); | 184 | return PTR_ERR(tsk); |
| 185 | } | 185 | } |
| 186 | |||
| 187 | get_task_struct(tsk); | 186 | get_task_struct(tsk); |
| 188 | *per_cpu_ptr(ht->store, cpu) = tsk; | 187 | *per_cpu_ptr(ht->store, cpu) = tsk; |
| 188 | if (ht->create) | ||
| 189 | ht->create(cpu); | ||
| 189 | return 0; | 190 | return 0; |
| 190 | } | 191 | } |
| 191 | 192 | ||
| @@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
| 225 | { | 226 | { |
| 226 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 227 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
| 227 | 228 | ||
| 228 | if (tsk) | 229 | if (tsk && !ht->selfparking) |
| 229 | kthread_park(tsk); | 230 | kthread_park(tsk); |
| 230 | } | 231 | } |
| 231 | 232 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..b4d252fd195b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) | |||
| 195 | EXPORT_SYMBOL(local_bh_enable_ip); | 195 | EXPORT_SYMBOL(local_bh_enable_ip); |
| 196 | 196 | ||
| 197 | /* | 197 | /* |
| 198 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | 198 | * We restart softirq processing for at most 2 ms, |
| 199 | * and we fall back to softirqd after that. | 199 | * and if need_resched() is not set. |
| 200 | * | 200 | * |
| 201 | * This number has been established via experimentation. | 201 | * These limits have been established via experimentation. |
| 202 | * The two things to balance is latency against fairness - | 202 | * The two things to balance is latency against fairness - |
| 203 | * we want to handle softirqs as soon as possible, but they | 203 | * we want to handle softirqs as soon as possible, but they |
| 204 | * should not be able to lock up the box. | 204 | * should not be able to lock up the box. |
| 205 | */ | 205 | */ |
| 206 | #define MAX_SOFTIRQ_RESTART 10 | 206 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) |
| 207 | 207 | ||
| 208 | asmlinkage void __do_softirq(void) | 208 | asmlinkage void __do_softirq(void) |
| 209 | { | 209 | { |
| 210 | struct softirq_action *h; | 210 | struct softirq_action *h; |
| 211 | __u32 pending; | 211 | __u32 pending; |
| 212 | int max_restart = MAX_SOFTIRQ_RESTART; | 212 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
| 213 | int cpu; | 213 | int cpu; |
| 214 | unsigned long old_flags = current->flags; | 214 | unsigned long old_flags = current->flags; |
| 215 | 215 | ||
| @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
| 221 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
| 222 | 222 | ||
| 223 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
| 224 | vtime_account_irq_enter(current); | 224 | account_irq_enter_time(current); |
| 225 | 225 | ||
| 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
| 227 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
| @@ -264,15 +264,16 @@ restart: | |||
| 264 | local_irq_disable(); | 264 | local_irq_disable(); |
| 265 | 265 | ||
| 266 | pending = local_softirq_pending(); | 266 | pending = local_softirq_pending(); |
| 267 | if (pending && --max_restart) | 267 | if (pending) { |
| 268 | goto restart; | 268 | if (time_before(jiffies, end) && !need_resched()) |
| 269 | goto restart; | ||
| 269 | 270 | ||
| 270 | if (pending) | ||
| 271 | wakeup_softirqd(); | 271 | wakeup_softirqd(); |
| 272 | } | ||
| 272 | 273 | ||
| 273 | lockdep_softirq_exit(); | 274 | lockdep_softirq_exit(); |
| 274 | 275 | ||
| 275 | vtime_account_irq_exit(current); | 276 | account_irq_exit_time(current); |
| 276 | __local_bh_enable(SOFTIRQ_OFFSET); | 277 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 278 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 278 | } | 279 | } |
| @@ -341,7 +342,7 @@ static inline void invoke_softirq(void) | |||
| 341 | */ | 342 | */ |
| 342 | void irq_exit(void) | 343 | void irq_exit(void) |
| 343 | { | 344 | { |
| 344 | vtime_account_irq_exit(current); | 345 | account_irq_exit_time(current); |
| 345 | trace_hardirq_exit(); | 346 | trace_hardirq_exit(); |
| 346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 347 | sub_preempt_count(IRQ_EXIT_OFFSET); |
| 347 | if (!in_interrupt() && local_softirq_pending()) | 348 | if (!in_interrupt() && local_softirq_pending()) |
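
Note on the softirq change above: __do_softirq() now bounds restarts by wall-clock time (MAX_SOFTIRQ_TIME, 2 ms) and by need_resched() instead of a fixed count of 10, waking ksoftirqd for whatever is still pending when the budget runs out. The same control-flow shape as a stand-alone user-space sketch (names and the fake work source are illustrative; clock_gettime() stands in for jiffies):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    #define BUDGET_NS (2 * 1000 * 1000)         /* 2 ms, like MAX_SOFTIRQ_TIME */

    static long long now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static int pending = 3;                     /* pretend-pending work items */
    static bool work_pending(void)  { return pending > 0; }
    static void do_one_unit(void)   { printf("handled item %d\n", pending--); }
    static bool should_yield(void)  { return false; }   /* stand-in for need_resched() */
    static void defer_to_thread(void) { puts("budget exhausted: deferring the rest"); }

    int main(void)
    {
        long long end = now_ns() + BUDGET_NS;   /* like end = jiffies + MAX_SOFTIRQ_TIME */

    restart:
        while (work_pending())
            do_one_unit();

        if (work_pending()) {                   /* new work arrived meanwhile? */
            if (now_ns() < end && !should_yield())
                goto restart;                   /* still inside the budget */
            defer_to_thread();                  /* like wakeup_softirqd() */
        }
        return 0;
    }

A deadline tracks elapsed time regardless of how many individual handlers fire, which is the point of dropping the restart counter: ten restarts of cheap softirqs are harmless, while ten restarts of expensive ones could keep the CPU in softirq context far too long.
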
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2b859828cdc3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
| @@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp) | |||
| 282 | */ | 282 | */ |
| 283 | void cleanup_srcu_struct(struct srcu_struct *sp) | 283 | void cleanup_srcu_struct(struct srcu_struct *sp) |
| 284 | { | 284 | { |
| 285 | int sum; | 285 | if (WARN_ON(srcu_readers_active(sp))) |
| 286 | 286 | return; /* Leakage unless caller handles error. */ | |
| 287 | sum = srcu_readers_active(sp); | ||
| 288 | WARN_ON(sum); /* Leakage unless caller handles error. */ | ||
| 289 | if (sum != 0) | ||
| 290 | return; | ||
| 291 | free_percpu(sp->per_cpu_ref); | 287 | free_percpu(sp->per_cpu_ref); |
| 292 | sp->per_cpu_ref = NULL; | 288 | sp->per_cpu_ref = NULL; |
| 293 | } | 289 | } |
| @@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
| 302 | { | 298 | { |
| 303 | int idx; | 299 | int idx; |
| 304 | 300 | ||
| 301 | idx = ACCESS_ONCE(sp->completed) & 0x1; | ||
| 305 | preempt_disable(); | 302 | preempt_disable(); |
| 306 | idx = rcu_dereference_index_check(sp->completed, | ||
| 307 | rcu_read_lock_sched_held()) & 0x1; | ||
| 308 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; | 303 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
| 309 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | 304 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
| 310 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; |
| @@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
| 321 | */ | 316 | */ |
| 322 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 317 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
| 323 | { | 318 | { |
| 324 | preempt_disable(); | ||
| 325 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | 319 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
| 326 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; | 320 | this_cpu_dec(sp->per_cpu_ref->c[idx]); |
| 327 | preempt_enable(); | ||
| 328 | } | 321 | } |
| 329 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 322 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
| 330 | 323 | ||
| @@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
| 423 | !lock_is_held(&rcu_sched_lock_map), | 416 | !lock_is_held(&rcu_sched_lock_map), |
| 424 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 417 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); |
| 425 | 418 | ||
| 419 | might_sleep(); | ||
| 426 | init_completion(&rcu.completion); | 420 | init_completion(&rcu.completion); |
| 427 | 421 | ||
| 428 | head->next = NULL; | 422 | head->next = NULL; |
| @@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
| 455 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | 449 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion |
| 456 | * @sp: srcu_struct with which to synchronize. | 450 | * @sp: srcu_struct with which to synchronize. |
| 457 | * | 451 | * |
| 458 | * Flip the completed counter, and wait for the old count to drain to zero. | 452 | * Wait for the count to drain to zero of both indexes. To avoid the |
| 459 | * As with classic RCU, the updater must use some separate means of | 453 | * possible starvation of synchronize_srcu(), it waits for the count of |
| 460 | * synchronizing concurrent updates. Can block; must be called from | 454 | * the index=((->completed & 1) ^ 1) to drain to zero at first, |
| 461 | * process context. | 455 | * and then flip the completed and wait for the count of the other index. |
| 456 | * | ||
| 457 | * Can block; must be called from process context. | ||
| 462 | * | 458 | * |
| 463 | * Note that it is illegal to call synchronize_srcu() from the corresponding | 459 | * Note that it is illegal to call synchronize_srcu() from the corresponding |
| 464 | * SRCU read-side critical section; doing so will result in deadlock. | 460 | * SRCU read-side critical section; doing so will result in deadlock. |
| @@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
| 480 | * Wait for an SRCU grace period to elapse, but be more aggressive about | 476 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
| 481 | * spinning rather than blocking when waiting. | 477 | * spinning rather than blocking when waiting. |
| 482 | * | 478 | * |
| 483 | * Note that it is illegal to call this function while holding any lock | 479 | * Note that it is also illegal to call synchronize_srcu_expedited() |
| 484 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call | 480 | * from the corresponding SRCU read-side critical section; |
| 485 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 481 | * doing so will result in deadlock. However, it is perfectly legal |
| 486 | * critical section; doing so will result in deadlock. However, it is | 482 | * to call synchronize_srcu_expedited() on one srcu_struct from some |
| 487 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 483 | * other srcu_struct's read-side critical section, as long as |
| 488 | * from some other srcu_struct's read-side critical section, as long as | ||
| 489 | * the resulting graph of srcu_structs is acyclic. | 484 | * the resulting graph of srcu_structs is acyclic. |
| 490 | */ | 485 | */ |
| 491 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
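
Note on the srcu.c changes above: the rewritten synchronize_srcu() comment describes the anti-starvation ordering — drain the index new readers are not using, then flip ->completed, then drain the index the pre-flip readers still hold. The index arithmetic in isolation, as a single-threaded toy (hypothetical toy_* names; the real code sums per-CPU counters, inserts memory barriers, and sleeps instead of spinning):

    #include <stdio.h>

    static unsigned int completed;     /* low bit selects the index readers enter on */
    static long readers[2];            /* stand-in for the summed per-CPU counts */

    static int toy_read_lock(void)
    {
        int idx = completed & 1;
        readers[idx]++;
        return idx;
    }

    static void toy_read_unlock(int idx)
    {
        readers[idx]--;
    }

    static void toy_synchronize(void)
    {
        /* 1. Drain the index new readers are NOT entering on ... */
        while (readers[(completed & 1) ^ 1])
            ;                          /* the real code sleeps and re-checks */

        /* 2. Flip: readers arriving from now on use the other index ... */
        completed++;

        /* 3. ... then drain the pre-flip readers still holding the old index. */
        while (readers[(completed & 1) ^ 1])
            ;
    }

    int main(void)
    {
        int idx = toy_read_lock();
        toy_read_unlock(idx);          /* this reader finished before the writer */
        toy_synchronize();
        printf("grace period complete, completed=%u\n", completed);
        return 0;
    }
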
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e965715..95d178c62d5a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | #include <linux/stop_machine.h> | 18 | #include <linux/stop_machine.h> |
| 19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | 21 | #include <linux/smpboot.h> | |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| @@ -37,10 +37,10 @@ struct cpu_stopper { | |||
| 37 | spinlock_t lock; | 37 | spinlock_t lock; |
| 38 | bool enabled; /* is this stopper enabled? */ | 38 | bool enabled; /* is this stopper enabled? */ |
| 39 | struct list_head works; /* list of pending works */ | 39 | struct list_head works; /* list of pending works */ |
| 40 | struct task_struct *thread; /* stopper thread */ | ||
| 41 | }; | 40 | }; |
| 42 | 41 | ||
| 43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 42 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
| 43 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); | ||
| 44 | static bool stop_machine_initialized = false; | 44 | static bool stop_machine_initialized = false; |
| 45 | 45 | ||
| 46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
| @@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | |||
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | /* queue @work to @stopper. if offline, @work is completed immediately */ | 64 | /* queue @work to @stopper. if offline, @work is completed immediately */ |
| 65 | static void cpu_stop_queue_work(struct cpu_stopper *stopper, | 65 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) |
| 66 | struct cpu_stop_work *work) | ||
| 67 | { | 66 | { |
| 67 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | ||
| 68 | struct task_struct *p = per_cpu(cpu_stopper_task, cpu); | ||
| 69 | |||
| 68 | unsigned long flags; | 70 | unsigned long flags; |
| 69 | 71 | ||
| 70 | spin_lock_irqsave(&stopper->lock, flags); | 72 | spin_lock_irqsave(&stopper->lock, flags); |
| 71 | 73 | ||
| 72 | if (stopper->enabled) { | 74 | if (stopper->enabled) { |
| 73 | list_add_tail(&work->list, &stopper->works); | 75 | list_add_tail(&work->list, &stopper->works); |
| 74 | wake_up_process(stopper->thread); | 76 | wake_up_process(p); |
| 75 | } else | 77 | } else |
| 76 | cpu_stop_signal_done(work->done, false); | 78 | cpu_stop_signal_done(work->done, false); |
| 77 | 79 | ||
| @@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
| 108 | struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; | 110 | struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; |
| 109 | 111 | ||
| 110 | cpu_stop_init_done(&done, 1); | 112 | cpu_stop_init_done(&done, 1); |
| 111 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); | 113 | cpu_stop_queue_work(cpu, &work); |
| 112 | wait_for_completion(&done.completion); | 114 | wait_for_completion(&done.completion); |
| 113 | return done.executed ? done.ret : -ENOENT; | 115 | return done.executed ? done.ret : -ENOENT; |
| 114 | } | 116 | } |
| @@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
| 130 | struct cpu_stop_work *work_buf) | 132 | struct cpu_stop_work *work_buf) |
| 131 | { | 133 | { |
| 132 | *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; | 134 | *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; |
| 133 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); | 135 | cpu_stop_queue_work(cpu, work_buf); |
| 134 | } | 136 | } |
| 135 | 137 | ||
| 136 | /* static data for stop_cpus */ | 138 | /* static data for stop_cpus */ |
| @@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 159 | */ | 161 | */ |
| 160 | preempt_disable(); | 162 | preempt_disable(); |
| 161 | for_each_cpu(cpu, cpumask) | 163 | for_each_cpu(cpu, cpumask) |
| 162 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), | 164 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); |
| 163 | &per_cpu(stop_cpus_work, cpu)); | ||
| 164 | preempt_enable(); | 165 | preempt_enable(); |
| 165 | } | 166 | } |
| 166 | 167 | ||
| @@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |||
| 244 | return ret; | 245 | return ret; |
| 245 | } | 246 | } |
| 246 | 247 | ||
| 247 | static int cpu_stopper_thread(void *data) | 248 | static int cpu_stop_should_run(unsigned int cpu) |
| 249 | { | ||
| 250 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | ||
| 251 | unsigned long flags; | ||
| 252 | int run; | ||
| 253 | |||
| 254 | spin_lock_irqsave(&stopper->lock, flags); | ||
| 255 | run = !list_empty(&stopper->works); | ||
| 256 | spin_unlock_irqrestore(&stopper->lock, flags); | ||
| 257 | return run; | ||
| 258 | } | ||
| 259 | |||
| 260 | static void cpu_stopper_thread(unsigned int cpu) | ||
| 248 | { | 261 | { |
| 249 | struct cpu_stopper *stopper = data; | 262 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
| 250 | struct cpu_stop_work *work; | 263 | struct cpu_stop_work *work; |
| 251 | int ret; | 264 | int ret; |
| 252 | 265 | ||
| 253 | repeat: | 266 | repeat: |
| 254 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | ||
| 255 | |||
| 256 | if (kthread_should_stop()) { | ||
| 257 | __set_current_state(TASK_RUNNING); | ||
| 258 | return 0; | ||
| 259 | } | ||
| 260 | |||
| 261 | work = NULL; | 267 | work = NULL; |
| 262 | spin_lock_irq(&stopper->lock); | 268 | spin_lock_irq(&stopper->lock); |
| 263 | if (!list_empty(&stopper->works)) { | 269 | if (!list_empty(&stopper->works)) { |
| @@ -273,8 +279,6 @@ repeat: | |||
| 273 | struct cpu_stop_done *done = work->done; | 279 | struct cpu_stop_done *done = work->done; |
| 274 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; | 280 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; |
| 275 | 281 | ||
| 276 | __set_current_state(TASK_RUNNING); | ||
| 277 | |||
| 278 | /* cpu stop callbacks are not allowed to sleep */ | 282 | /* cpu stop callbacks are not allowed to sleep */ |
| 279 | preempt_disable(); | 283 | preempt_disable(); |
| 280 | 284 | ||
| @@ -290,88 +294,55 @@ repeat: | |||
| 290 | ksym_buf), arg); | 294 | ksym_buf), arg); |
| 291 | 295 | ||
| 292 | cpu_stop_signal_done(done, true); | 296 | cpu_stop_signal_done(done, true); |
| 293 | } else | 297 | goto repeat; |
| 294 | schedule(); | 298 | } |
| 295 | |||
| 296 | goto repeat; | ||
| 297 | } | 299 | } |
| 298 | 300 | ||
| 299 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | 301 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); |
| 300 | 302 | ||
| 301 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ | 303 | static void cpu_stop_create(unsigned int cpu) |
| 302 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | 304 | { |
| 303 | unsigned long action, void *hcpu) | 305 | sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); |
| 306 | } | ||
| 307 | |||
| 308 | static void cpu_stop_park(unsigned int cpu) | ||
| 304 | { | 309 | { |
| 305 | unsigned int cpu = (unsigned long)hcpu; | ||
| 306 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 310 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
| 307 | struct task_struct *p; | 311 | struct cpu_stop_work *work; |
| 308 | 312 | unsigned long flags; | |
| 309 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 310 | case CPU_UP_PREPARE: | ||
| 311 | BUG_ON(stopper->thread || stopper->enabled || | ||
| 312 | !list_empty(&stopper->works)); | ||
| 313 | p = kthread_create_on_node(cpu_stopper_thread, | ||
| 314 | stopper, | ||
| 315 | cpu_to_node(cpu), | ||
| 316 | "migration/%d", cpu); | ||
| 317 | if (IS_ERR(p)) | ||
| 318 | return notifier_from_errno(PTR_ERR(p)); | ||
| 319 | get_task_struct(p); | ||
| 320 | kthread_bind(p, cpu); | ||
| 321 | sched_set_stop_task(cpu, p); | ||
| 322 | stopper->thread = p; | ||
| 323 | break; | ||
| 324 | |||
| 325 | case CPU_ONLINE: | ||
| 326 | /* strictly unnecessary, as first user will wake it */ | ||
| 327 | wake_up_process(stopper->thread); | ||
| 328 | /* mark enabled */ | ||
| 329 | spin_lock_irq(&stopper->lock); | ||
| 330 | stopper->enabled = true; | ||
| 331 | spin_unlock_irq(&stopper->lock); | ||
| 332 | break; | ||
| 333 | |||
| 334 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 335 | case CPU_UP_CANCELED: | ||
| 336 | case CPU_POST_DEAD: | ||
| 337 | { | ||
| 338 | struct cpu_stop_work *work; | ||
| 339 | |||
| 340 | sched_set_stop_task(cpu, NULL); | ||
| 341 | /* kill the stopper */ | ||
| 342 | kthread_stop(stopper->thread); | ||
| 343 | /* drain remaining works */ | ||
| 344 | spin_lock_irq(&stopper->lock); | ||
| 345 | list_for_each_entry(work, &stopper->works, list) | ||
| 346 | cpu_stop_signal_done(work->done, false); | ||
| 347 | stopper->enabled = false; | ||
| 348 | spin_unlock_irq(&stopper->lock); | ||
| 349 | /* release the stopper */ | ||
| 350 | put_task_struct(stopper->thread); | ||
| 351 | stopper->thread = NULL; | ||
| 352 | break; | ||
| 353 | } | ||
| 354 | #endif | ||
| 355 | } | ||
| 356 | 313 | ||
| 357 | return NOTIFY_OK; | 314 | /* drain remaining works */ |
| 315 | spin_lock_irqsave(&stopper->lock, flags); | ||
| 316 | list_for_each_entry(work, &stopper->works, list) | ||
| 317 | cpu_stop_signal_done(work->done, false); | ||
| 318 | stopper->enabled = false; | ||
| 319 | spin_unlock_irqrestore(&stopper->lock, flags); | ||
| 358 | } | 320 | } |
| 359 | 321 | ||
| 360 | /* | 322 | static void cpu_stop_unpark(unsigned int cpu) |
| 361 | * Give it a higher priority so that cpu stopper is available to other | 323 | { |
| 362 | * cpu notifiers. It currently shares the same priority as sched | 324 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
| 363 | * migration_notifier. | 325 | |
| 364 | */ | 326 | spin_lock_irq(&stopper->lock); |
| 365 | static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { | 327 | stopper->enabled = true; |
| 366 | .notifier_call = cpu_stop_cpu_callback, | 328 | spin_unlock_irq(&stopper->lock); |
| 367 | .priority = 10, | 329 | } |
| 330 | |||
| 331 | static struct smp_hotplug_thread cpu_stop_threads = { | ||
| 332 | .store = &cpu_stopper_task, | ||
| 333 | .thread_should_run = cpu_stop_should_run, | ||
| 334 | .thread_fn = cpu_stopper_thread, | ||
| 335 | .thread_comm = "migration/%u", | ||
| 336 | .create = cpu_stop_create, | ||
| 337 | .setup = cpu_stop_unpark, | ||
| 338 | .park = cpu_stop_park, | ||
| 339 | .unpark = cpu_stop_unpark, | ||
| 340 | .selfparking = true, | ||
| 368 | }; | 341 | }; |
| 369 | 342 | ||
| 370 | static int __init cpu_stop_init(void) | 343 | static int __init cpu_stop_init(void) |
| 371 | { | 344 | { |
| 372 | void *bcpu = (void *)(long)smp_processor_id(); | ||
| 373 | unsigned int cpu; | 345 | unsigned int cpu; |
| 374 | int err; | ||
| 375 | 346 | ||
| 376 | for_each_possible_cpu(cpu) { | 347 | for_each_possible_cpu(cpu) { |
| 377 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 348 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
| @@ -380,15 +351,8 @@ static int __init cpu_stop_init(void) | |||
| 380 | INIT_LIST_HEAD(&stopper->works); | 351 | INIT_LIST_HEAD(&stopper->works); |
| 381 | } | 352 | } |
| 382 | 353 | ||
| 383 | /* start one for the boot cpu */ | 354 | BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); |
| 384 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, | ||
| 385 | bcpu); | ||
| 386 | BUG_ON(err != NOTIFY_OK); | ||
| 387 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | ||
| 388 | register_cpu_notifier(&cpu_stop_cpu_notifier); | ||
| 389 | |||
| 390 | stop_machine_initialized = true; | 355 | stop_machine_initialized = true; |
| 391 | |||
| 392 | return 0; | 356 | return 0; |
| 393 | } | 357 | } |
| 394 | early_initcall(cpu_stop_init); | 358 | early_initcall(cpu_stop_init); |
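
Note on the stop_machine conversion above: the open-coded cpu_stop_cpu_callback() hotplug notifier is replaced by a struct smp_hotplug_thread description, so the smpboot core (also touched earlier in this series) creates, binds, parks and unparks the per-CPU thread, while stop_machine only supplies hooks plus .selfparking so the generic code skips kthread_park() for it (see the smpboot.c hunk above). The general "callback table + generic driver" shape, reduced to a stand-alone user-space sketch (hypothetical percpu_worker names, not the kernel API; no kthreads or hotplug events):

    #include <stdbool.h>
    #include <stdio.h>

    struct percpu_worker {              /* in the spirit of struct smp_hotplug_thread */
        bool (*should_run)(int cpu);
        void (*work_fn)(int cpu);
        void (*setup)(int cpu);
        void (*park)(int cpu);
        const char *comm;
    };

    static int todo = 3;

    static bool stopper_should_run(int cpu) { return todo > 0; }
    static void stopper_work(int cpu)       { printf("cpu%d: work item %d\n", cpu, todo--); }
    static void stopper_setup(int cpu)      { printf("cpu%d: stopper enabled\n", cpu); }
    static void stopper_park(int cpu)       { printf("cpu%d: parked, queue drained\n", cpu); }

    /* Roughly what the generic core does per CPU, minus kthreads and parking waits. */
    static void run_worker(const struct percpu_worker *w, int cpu)
    {
        w->setup(cpu);
        while (w->should_run(cpu))
            w->work_fn(cpu);
        w->park(cpu);
    }

    int main(void)
    {
        static const struct percpu_worker cpu_stop_like = {
            .should_run = stopper_should_run,
            .work_fn    = stopper_work,
            .setup      = stopper_setup,
            .park       = stopper_park,
            .comm       = "migration/%u",
        };

        run_worker(&cpu_stop_like, 0);
        return 0;
    }

Moving the lifecycle into one generic driver is what lets this diff delete the whole CPU_UP_PREPARE/CPU_ONLINE/CPU_POST_DEAD switch from stop_machine.c.
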
diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..81f56445fba9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/syscalls.h> | 47 | #include <linux/syscalls.h> |
| 48 | #include <linux/kprobes.h> | 48 | #include <linux/kprobes.h> |
| 49 | #include <linux/user_namespace.h> | 49 | #include <linux/user_namespace.h> |
| 50 | #include <linux/binfmts.h> | ||
| 50 | 51 | ||
| 51 | #include <linux/kmsg_dump.h> | 52 | #include <linux/kmsg_dump.h> |
| 52 | /* Move somewhere else to avoid recompiling? */ | 53 | /* Move somewhere else to avoid recompiling? */ |
| @@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex); | |||
| 433 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | 434 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, |
| 434 | void __user *, arg) | 435 | void __user *, arg) |
| 435 | { | 436 | { |
| 437 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
| 436 | char buffer[256]; | 438 | char buffer[256]; |
| 437 | int ret = 0; | 439 | int ret = 0; |
| 438 | 440 | ||
| 439 | /* We only trust the superuser with rebooting the system. */ | 441 | /* We only trust the superuser with rebooting the system. */ |
| 440 | if (!capable(CAP_SYS_BOOT)) | 442 | if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) |
| 441 | return -EPERM; | 443 | return -EPERM; |
| 442 | 444 | ||
| 443 | /* For safety, we require "magic" arguments. */ | 445 | /* For safety, we require "magic" arguments. */ |
| @@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | |||
| 453 | * pid_namespace, the command is handled by reboot_pid_ns() which will | 455 | * pid_namespace, the command is handled by reboot_pid_ns() which will |
| 454 | * call do_exit(). | 456 | * call do_exit(). |
| 455 | */ | 457 | */ |
| 456 | ret = reboot_pid_ns(task_active_pid_ns(current), cmd); | 458 | ret = reboot_pid_ns(pid_ns, cmd); |
| 457 | if (ret) | 459 | if (ret) |
| 458 | return ret; | 460 | return ret; |
| 459 | 461 | ||
| @@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1792 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1794 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
| 1793 | { | 1795 | { |
| 1794 | struct fd exe; | 1796 | struct fd exe; |
| 1795 | struct dentry *dentry; | 1797 | struct inode *inode; |
| 1796 | int err; | 1798 | int err; |
| 1797 | 1799 | ||
| 1798 | exe = fdget(fd); | 1800 | exe = fdget(fd); |
| 1799 | if (!exe.file) | 1801 | if (!exe.file) |
| 1800 | return -EBADF; | 1802 | return -EBADF; |
| 1801 | 1803 | ||
| 1802 | dentry = exe.file->f_path.dentry; | 1804 | inode = file_inode(exe.file); |
| 1803 | 1805 | ||
| 1804 | /* | 1806 | /* |
| 1805 | * Because the original mm->exe_file points to executable file, make | 1807 | * Because the original mm->exe_file points to executable file, make |
| @@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1807 | * overall picture. | 1809 | * overall picture. |
| 1808 | */ | 1810 | */ |
| 1809 | err = -EACCES; | 1811 | err = -EACCES; |
| 1810 | if (!S_ISREG(dentry->d_inode->i_mode) || | 1812 | if (!S_ISREG(inode->i_mode) || |
| 1811 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 1813 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
| 1812 | goto exit; | 1814 | goto exit; |
| 1813 | 1815 | ||
| 1814 | err = inode_permission(dentry->d_inode, MAY_EXEC); | 1816 | err = inode_permission(inode, MAY_EXEC); |
| 1815 | if (err) | 1817 | if (err) |
| 1816 | goto exit; | 1818 | goto exit; |
| 1817 | 1819 | ||
| @@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2012 | 2014 | ||
| 2013 | error = 0; | 2015 | error = 0; |
| 2014 | switch (option) { | 2016 | switch (option) { |
| 2015 | case PR_SET_PDEATHSIG: | 2017 | case PR_SET_PDEATHSIG: |
| 2016 | if (!valid_signal(arg2)) { | 2018 | if (!valid_signal(arg2)) { |
| 2017 | error = -EINVAL; | 2019 | error = -EINVAL; |
| 2018 | break; | ||
| 2019 | } | ||
| 2020 | me->pdeath_signal = arg2; | ||
| 2021 | break; | ||
| 2022 | case PR_GET_PDEATHSIG: | ||
| 2023 | error = put_user(me->pdeath_signal, (int __user *)arg2); | ||
| 2024 | break; | ||
| 2025 | case PR_GET_DUMPABLE: | ||
| 2026 | error = get_dumpable(me->mm); | ||
| 2027 | break; | 2020 | break; |
| 2028 | case PR_SET_DUMPABLE: | 2021 | } |
| 2029 | if (arg2 < 0 || arg2 > 1) { | 2022 | me->pdeath_signal = arg2; |
| 2030 | error = -EINVAL; | 2023 | break; |
| 2031 | break; | 2024 | case PR_GET_PDEATHSIG: |
| 2032 | } | 2025 | error = put_user(me->pdeath_signal, (int __user *)arg2); |
| 2033 | set_dumpable(me->mm, arg2); | 2026 | break; |
| 2027 | case PR_GET_DUMPABLE: | ||
| 2028 | error = get_dumpable(me->mm); | ||
| 2029 | break; | ||
| 2030 | case PR_SET_DUMPABLE: | ||
| 2031 | if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { | ||
| 2032 | error = -EINVAL; | ||
| 2034 | break; | 2033 | break; |
| 2034 | } | ||
| 2035 | set_dumpable(me->mm, arg2); | ||
| 2036 | break; | ||
| 2035 | 2037 | ||
| 2036 | case PR_SET_UNALIGN: | 2038 | case PR_SET_UNALIGN: |
| 2037 | error = SET_UNALIGN_CTL(me, arg2); | 2039 | error = SET_UNALIGN_CTL(me, arg2); |
| 2038 | break; | 2040 | break; |
| 2039 | case PR_GET_UNALIGN: | 2041 | case PR_GET_UNALIGN: |
| 2040 | error = GET_UNALIGN_CTL(me, arg2); | 2042 | error = GET_UNALIGN_CTL(me, arg2); |
| 2041 | break; | 2043 | break; |
| 2042 | case PR_SET_FPEMU: | 2044 | case PR_SET_FPEMU: |
| 2043 | error = SET_FPEMU_CTL(me, arg2); | 2045 | error = SET_FPEMU_CTL(me, arg2); |
| 2044 | break; | 2046 | break; |
| 2045 | case PR_GET_FPEMU: | 2047 | case PR_GET_FPEMU: |
| 2046 | error = GET_FPEMU_CTL(me, arg2); | 2048 | error = GET_FPEMU_CTL(me, arg2); |
| 2047 | break; | 2049 | break; |
| 2048 | case PR_SET_FPEXC: | 2050 | case PR_SET_FPEXC: |
| 2049 | error = SET_FPEXC_CTL(me, arg2); | 2051 | error = SET_FPEXC_CTL(me, arg2); |
| 2050 | break; | 2052 | break; |
| 2051 | case PR_GET_FPEXC: | 2053 | case PR_GET_FPEXC: |
| 2052 | error = GET_FPEXC_CTL(me, arg2); | 2054 | error = GET_FPEXC_CTL(me, arg2); |
| 2053 | break; | 2055 | break; |
| 2054 | case PR_GET_TIMING: | 2056 | case PR_GET_TIMING: |
| 2055 | error = PR_TIMING_STATISTICAL; | 2057 | error = PR_TIMING_STATISTICAL; |
| 2056 | break; | 2058 | break; |
| 2057 | case PR_SET_TIMING: | 2059 | case PR_SET_TIMING: |
| 2058 | if (arg2 != PR_TIMING_STATISTICAL) | 2060 | if (arg2 != PR_TIMING_STATISTICAL) |
| 2059 | error = -EINVAL; | 2061 | error = -EINVAL; |
| 2060 | break; | 2062 | break; |
| 2061 | case PR_SET_NAME: | 2063 | case PR_SET_NAME: |
| 2062 | comm[sizeof(me->comm)-1] = 0; | 2064 | comm[sizeof(me->comm) - 1] = 0; |
| 2063 | if (strncpy_from_user(comm, (char __user *)arg2, | 2065 | if (strncpy_from_user(comm, (char __user *)arg2, |
| 2064 | sizeof(me->comm) - 1) < 0) | 2066 | sizeof(me->comm) - 1) < 0) |
| 2065 | return -EFAULT; | 2067 | return -EFAULT; |
| 2066 | set_task_comm(me, comm); | 2068 | set_task_comm(me, comm); |
| 2067 | proc_comm_connector(me); | 2069 | proc_comm_connector(me); |
| 2068 | break; | 2070 | break; |
| 2069 | case PR_GET_NAME: | 2071 | case PR_GET_NAME: |
| 2070 | get_task_comm(comm, me); | 2072 | get_task_comm(comm, me); |
| 2071 | if (copy_to_user((char __user *)arg2, comm, | 2073 | if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) |
| 2072 | sizeof(comm))) | 2074 | return -EFAULT; |
| 2073 | return -EFAULT; | 2075 | break; |
| 2074 | break; | 2076 | case PR_GET_ENDIAN: |
| 2075 | case PR_GET_ENDIAN: | 2077 | error = GET_ENDIAN(me, arg2); |
| 2076 | error = GET_ENDIAN(me, arg2); | 2078 | break; |
| 2077 | break; | 2079 | case PR_SET_ENDIAN: |
| 2078 | case PR_SET_ENDIAN: | 2080 | error = SET_ENDIAN(me, arg2); |
| 2079 | error = SET_ENDIAN(me, arg2); | 2081 | break; |
| 2080 | break; | 2082 | case PR_GET_SECCOMP: |
| 2081 | case PR_GET_SECCOMP: | 2083 | error = prctl_get_seccomp(); |
| 2082 | error = prctl_get_seccomp(); | 2084 | break; |
| 2083 | break; | 2085 | case PR_SET_SECCOMP: |
| 2084 | case PR_SET_SECCOMP: | 2086 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
| 2085 | error = prctl_set_seccomp(arg2, (char __user *)arg3); | 2087 | break; |
| 2086 | break; | 2088 | case PR_GET_TSC: |
| 2087 | case PR_GET_TSC: | 2089 | error = GET_TSC_CTL(arg2); |
| 2088 | error = GET_TSC_CTL(arg2); | 2090 | break; |
| 2089 | break; | 2091 | case PR_SET_TSC: |
| 2090 | case PR_SET_TSC: | 2092 | error = SET_TSC_CTL(arg2); |
| 2091 | error = SET_TSC_CTL(arg2); | 2093 | break; |
| 2092 | break; | 2094 | case PR_TASK_PERF_EVENTS_DISABLE: |
| 2093 | case PR_TASK_PERF_EVENTS_DISABLE: | 2095 | error = perf_event_task_disable(); |
| 2094 | error = perf_event_task_disable(); | 2096 | break; |
| 2095 | break; | 2097 | case PR_TASK_PERF_EVENTS_ENABLE: |
| 2096 | case PR_TASK_PERF_EVENTS_ENABLE: | 2098 | error = perf_event_task_enable(); |
| 2097 | error = perf_event_task_enable(); | 2099 | break; |
| 2098 | break; | 2100 | case PR_GET_TIMERSLACK: |
| 2099 | case PR_GET_TIMERSLACK: | 2101 | error = current->timer_slack_ns; |
| 2100 | error = current->timer_slack_ns; | 2102 | break; |
| 2101 | break; | 2103 | case PR_SET_TIMERSLACK: |
| 2102 | case PR_SET_TIMERSLACK: | 2104 | if (arg2 <= 0) |
| 2103 | if (arg2 <= 0) | 2105 | current->timer_slack_ns = |
| 2104 | current->timer_slack_ns = | ||
| 2105 | current->default_timer_slack_ns; | 2106 | current->default_timer_slack_ns; |
| 2106 | else | 2107 | else |
| 2107 | current->timer_slack_ns = arg2; | 2108 | current->timer_slack_ns = arg2; |
| 2108 | break; | 2109 | break; |
| 2109 | case PR_MCE_KILL: | 2110 | case PR_MCE_KILL: |
| 2110 | if (arg4 | arg5) | 2111 | if (arg4 | arg5) |
| 2111 | return -EINVAL; | 2112 | return -EINVAL; |
| 2112 | switch (arg2) { | 2113 | switch (arg2) { |
| 2113 | case PR_MCE_KILL_CLEAR: | 2114 | case PR_MCE_KILL_CLEAR: |
| 2114 | if (arg3 != 0) | 2115 | if (arg3 != 0) |
| 2115 | return -EINVAL; | ||
| 2116 | current->flags &= ~PF_MCE_PROCESS; | ||
| 2117 | break; | ||
| 2118 | case PR_MCE_KILL_SET: | ||
| 2119 | current->flags |= PF_MCE_PROCESS; | ||
| 2120 | if (arg3 == PR_MCE_KILL_EARLY) | ||
| 2121 | current->flags |= PF_MCE_EARLY; | ||
| 2122 | else if (arg3 == PR_MCE_KILL_LATE) | ||
| 2123 | current->flags &= ~PF_MCE_EARLY; | ||
| 2124 | else if (arg3 == PR_MCE_KILL_DEFAULT) | ||
| 2125 | current->flags &= | ||
| 2126 | ~(PF_MCE_EARLY|PF_MCE_PROCESS); | ||
| 2127 | else | ||
| 2128 | return -EINVAL; | ||
| 2129 | break; | ||
| 2130 | default: | ||
| 2131 | return -EINVAL; | 2116 | return -EINVAL; |
| 2132 | } | 2117 | current->flags &= ~PF_MCE_PROCESS; |
| 2133 | break; | 2118 | break; |
| 2134 | case PR_MCE_KILL_GET: | 2119 | case PR_MCE_KILL_SET: |
| 2135 | if (arg2 | arg3 | arg4 | arg5) | 2120 | current->flags |= PF_MCE_PROCESS; |
| 2136 | return -EINVAL; | 2121 | if (arg3 == PR_MCE_KILL_EARLY) |
| 2137 | if (current->flags & PF_MCE_PROCESS) | 2122 | current->flags |= PF_MCE_EARLY; |
| 2138 | error = (current->flags & PF_MCE_EARLY) ? | 2123 | else if (arg3 == PR_MCE_KILL_LATE) |
| 2139 | PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; | 2124 | current->flags &= ~PF_MCE_EARLY; |
| 2125 | else if (arg3 == PR_MCE_KILL_DEFAULT) | ||
| 2126 | current->flags &= | ||
| 2127 | ~(PF_MCE_EARLY|PF_MCE_PROCESS); | ||
| 2140 | else | 2128 | else |
| 2141 | error = PR_MCE_KILL_DEFAULT; | ||
| 2142 | break; | ||
| 2143 | case PR_SET_MM: | ||
| 2144 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
| 2145 | break; | ||
| 2146 | case PR_GET_TID_ADDRESS: | ||
| 2147 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
| 2148 | break; | ||
| 2149 | case PR_SET_CHILD_SUBREAPER: | ||
| 2150 | me->signal->is_child_subreaper = !!arg2; | ||
| 2151 | break; | ||
| 2152 | case PR_GET_CHILD_SUBREAPER: | ||
| 2153 | error = put_user(me->signal->is_child_subreaper, | ||
| 2154 | (int __user *) arg2); | ||
| 2155 | break; | ||
| 2156 | case PR_SET_NO_NEW_PRIVS: | ||
| 2157 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
| 2158 | return -EINVAL; | 2129 | return -EINVAL; |
| 2159 | |||
| 2160 | current->no_new_privs = 1; | ||
| 2161 | break; | 2130 | break; |
| 2162 | case PR_GET_NO_NEW_PRIVS: | ||
| 2163 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2164 | return -EINVAL; | ||
| 2165 | return current->no_new_privs ? 1 : 0; | ||
| 2166 | default: | 2131 | default: |
| 2167 | error = -EINVAL; | 2132 | return -EINVAL; |
| 2168 | break; | 2133 | } |
| 2134 | break; | ||
| 2135 | case PR_MCE_KILL_GET: | ||
| 2136 | if (arg2 | arg3 | arg4 | arg5) | ||
| 2137 | return -EINVAL; | ||
| 2138 | if (current->flags & PF_MCE_PROCESS) | ||
| 2139 | error = (current->flags & PF_MCE_EARLY) ? | ||
| 2140 | PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; | ||
| 2141 | else | ||
| 2142 | error = PR_MCE_KILL_DEFAULT; | ||
| 2143 | break; | ||
| 2144 | case PR_SET_MM: | ||
| 2145 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
| 2146 | break; | ||
| 2147 | case PR_GET_TID_ADDRESS: | ||
| 2148 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
| 2149 | break; | ||
| 2150 | case PR_SET_CHILD_SUBREAPER: | ||
| 2151 | me->signal->is_child_subreaper = !!arg2; | ||
| 2152 | break; | ||
| 2153 | case PR_GET_CHILD_SUBREAPER: | ||
| 2154 | error = put_user(me->signal->is_child_subreaper, | ||
| 2155 | (int __user *)arg2); | ||
| 2156 | break; | ||
| 2157 | case PR_SET_NO_NEW_PRIVS: | ||
| 2158 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
| 2159 | return -EINVAL; | ||
| 2160 | |||
| 2161 | current->no_new_privs = 1; | ||
| 2162 | break; | ||
| 2163 | case PR_GET_NO_NEW_PRIVS: | ||
| 2164 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2165 | return -EINVAL; | ||
| 2166 | return current->no_new_privs ? 1 : 0; | ||
| 2167 | default: | ||
| 2168 | error = -EINVAL; | ||
| 2169 | break; | ||
| 2169 | } | 2170 | } |
| 2170 | return error; | 2171 | return error; |
| 2171 | } | 2172 | } |
| @@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
| 2184 | 2185 | ||
| 2185 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 2186 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; |
| 2186 | 2187 | ||
| 2187 | static void argv_cleanup(struct subprocess_info *info) | ||
| 2188 | { | ||
| 2189 | argv_free(info->argv); | ||
| 2190 | } | ||
| 2191 | |||
| 2192 | static int __orderly_poweroff(void) | 2188 | static int __orderly_poweroff(void) |
| 2193 | { | 2189 | { |
| 2194 | int argc; | 2190 | int argc; |
| @@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void) | |||
| 2208 | } | 2204 | } |
| 2209 | 2205 | ||
| 2210 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, | 2206 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, |
| 2211 | NULL, argv_cleanup, NULL); | 2207 | NULL, NULL, NULL); |
| 2212 | if (ret == -ENOMEM) | 2208 | argv_free(argv); |
| 2213 | argv_free(argv); | ||
| 2214 | 2209 | ||
| 2215 | return ret; | 2210 | return ret; |
| 2216 | } | 2211 | } |
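
Note on the sys.c changes above: besides the large re-indentation of the prctl() switch, the functional changes are that PR_SET_DUMPABLE now accepts only the named constants SUID_DUMP_DISABLE and SUID_DUMP_USER instead of a signed 0..1 range check, sys_reboot() checks CAP_SYS_BOOT against the pid namespace's owning user namespace, prctl_set_mm_exe_file() goes through file_inode(), and __orderly_poweroff() frees argv unconditionally rather than via a cleanup callback. From user space the dumpable knob is the ordinary prctl(2) interface, for example (standard API, shown only as an illustration, minimal error handling):

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
        /* 0 == SUID_DUMP_DISABLE: no core dumps, unprivileged ptrace attach refused. */
        if (prctl(PR_SET_DUMPABLE, 0L, 0L, 0L, 0L) != 0)
            perror("PR_SET_DUMPABLE");

        /* Read the flag back: 0 = not dumpable, 1 = dumpable. */
        printf("dumpable = %d\n", prctl(PR_GET_DUMPABLE, 0L, 0L, 0L, 0L));

        /* Anything other than 0 or 1 is rejected with EINVAL by the hunk above. */
        if (prctl(PR_SET_DUMPABLE, 2L, 0L, 0L, 0L) != 0)
            perror("PR_SET_DUMPABLE(2)");
        return 0;
    }
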
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..d1b4ee67d2df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -61,6 +61,7 @@ | |||
| 61 | #include <linux/kmod.h> | 61 | #include <linux/kmod.h> |
| 62 | #include <linux/capability.h> | 62 | #include <linux/capability.h> |
| 63 | #include <linux/binfmts.h> | 63 | #include <linux/binfmts.h> |
| 64 | #include <linux/sched/sysctl.h> | ||
| 64 | 65 | ||
| 65 | #include <asm/uaccess.h> | 66 | #include <asm/uaccess.h> |
| 66 | #include <asm/processor.h> | 67 | #include <asm/processor.h> |
| @@ -104,7 +105,6 @@ extern char core_pattern[]; | |||
| 104 | extern unsigned int core_pipe_limit; | 105 | extern unsigned int core_pipe_limit; |
| 105 | #endif | 106 | #endif |
| 106 | extern int pid_max; | 107 | extern int pid_max; |
| 107 | extern int min_free_kbytes; | ||
| 108 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
| 109 | extern int sysctl_drop_caches; | 109 | extern int sysctl_drop_caches; |
| 110 | extern int percpu_pagelist_fraction; | 110 | extern int percpu_pagelist_fraction; |
| @@ -161,10 +161,13 @@ extern int unaligned_enabled; | |||
| 161 | #endif | 161 | #endif |
| 162 | 162 | ||
| 163 | #ifdef CONFIG_IA64 | 163 | #ifdef CONFIG_IA64 |
| 164 | extern int no_unaligned_warning; | ||
| 165 | extern int unaligned_dump_stack; | 164 | extern int unaligned_dump_stack; |
| 166 | #endif | 165 | #endif |
| 167 | 166 | ||
| 167 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN | ||
| 168 | extern int no_unaligned_warning; | ||
| 169 | #endif | ||
| 170 | |||
| 168 | #ifdef CONFIG_PROC_SYSCTL | 171 | #ifdef CONFIG_PROC_SYSCTL |
| 169 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 172 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
| 170 | void __user *buffer, size_t *lenp, loff_t *ppos); | 173 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -403,6 +406,13 @@ static struct ctl_table kern_table[] = { | |||
| 403 | .mode = 0644, | 406 | .mode = 0644, |
| 404 | .proc_handler = sched_rt_handler, | 407 | .proc_handler = sched_rt_handler, |
| 405 | }, | 408 | }, |
| 409 | { | ||
| 410 | .procname = "sched_rr_timeslice_ms", | ||
| 411 | .data = &sched_rr_timeslice, | ||
| 412 | .maxlen = sizeof(int), | ||
| 413 | .mode = 0644, | ||
| 414 | .proc_handler = sched_rr_handler, | ||
| 415 | }, | ||
| 406 | #ifdef CONFIG_SCHED_AUTOGROUP | 416 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 407 | { | 417 | { |
| 408 | .procname = "sched_autogroup_enabled", | 418 | .procname = "sched_autogroup_enabled", |
| @@ -911,7 +921,7 @@ static struct ctl_table kern_table[] = { | |||
| 911 | .proc_handler = proc_doulongvec_minmax, | 921 | .proc_handler = proc_doulongvec_minmax, |
| 912 | }, | 922 | }, |
| 913 | #endif | 923 | #endif |
| 914 | #ifdef CONFIG_IA64 | 924 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN |
| 915 | { | 925 | { |
| 916 | .procname = "ignore-unaligned-usertrap", | 926 | .procname = "ignore-unaligned-usertrap", |
| 917 | .data = &no_unaligned_warning, | 927 | .data = &no_unaligned_warning, |
| @@ -919,6 +929,8 @@ static struct ctl_table kern_table[] = { | |||
| 919 | .mode = 0644, | 929 | .mode = 0644, |
| 920 | .proc_handler = proc_dointvec, | 930 | .proc_handler = proc_dointvec, |
| 921 | }, | 931 | }, |
| 932 | #endif | ||
| 933 | #ifdef CONFIG_IA64 | ||
| 922 | { | 934 | { |
| 923 | .procname = "unaligned-dump-stack", | 935 | .procname = "unaligned-dump-stack", |
| 924 | .data = &unaligned_dump_stack, | 936 | .data = &unaligned_dump_stack, |
| @@ -2006,7 +2018,7 @@ static int proc_taint(struct ctl_table *table, int write, | |||
| 2006 | int i; | 2018 | int i; |
| 2007 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { | 2019 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { |
| 2008 | if ((tmptaint >> i) & 1) | 2020 | if ((tmptaint >> i) & 1) |
| 2009 | add_taint(i); | 2021 | add_taint(i, LOCKDEP_STILL_OK); |
| 2010 | } | 2022 | } |
| 2011 | } | 2023 | } |
| 2012 | 2024 | ||
| @@ -2083,7 +2095,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2083 | static void validate_coredump_safety(void) | 2095 | static void validate_coredump_safety(void) |
| 2084 | { | 2096 | { |
| 2085 | #ifdef CONFIG_COREDUMP | 2097 | #ifdef CONFIG_COREDUMP |
| 2086 | if (suid_dumpable == SUID_DUMPABLE_SAFE && | 2098 | if (suid_dumpable == SUID_DUMP_ROOT && |
| 2087 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2099 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
| 2088 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2100 | printk(KERN_WARNING "Unsafe core_pattern used with "\ |
| 2089 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2101 | "suid_dumpable=2. Pipe handler or fully qualified "\ |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 5a6384450501..ebf72358e86a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { | |||
| 387 | { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, | 387 | { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, |
| 388 | { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, | 388 | { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, |
| 389 | { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, | 389 | { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, |
| 390 | { CTL_INT, NET_TCP_ABC, "tcp_abc" }, | ||
| 391 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, | 390 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, |
| 392 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, | 391 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, |
| 393 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, | 392 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, |
| @@ -971,7 +970,6 @@ out: | |||
| 971 | static ssize_t bin_intvec(struct file *file, | 970 | static ssize_t bin_intvec(struct file *file, |
| 972 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 971 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
| 973 | { | 972 | { |
| 974 | mm_segment_t old_fs = get_fs(); | ||
| 975 | ssize_t copied = 0; | 973 | ssize_t copied = 0; |
| 976 | char *buffer; | 974 | char *buffer; |
| 977 | ssize_t result; | 975 | ssize_t result; |
| @@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file, | |||
| 984 | if (oldval && oldlen) { | 982 | if (oldval && oldlen) { |
| 985 | unsigned __user *vec = oldval; | 983 | unsigned __user *vec = oldval; |
| 986 | size_t length = oldlen / sizeof(*vec); | 984 | size_t length = oldlen / sizeof(*vec); |
| 987 | loff_t pos = 0; | ||
| 988 | char *str, *end; | 985 | char *str, *end; |
| 989 | int i; | 986 | int i; |
| 990 | 987 | ||
| 991 | set_fs(KERNEL_DS); | 988 | result = kernel_read(file, 0, buffer, BUFSZ - 1); |
| 992 | result = vfs_read(file, buffer, BUFSZ - 1, &pos); | ||
| 993 | set_fs(old_fs); | ||
| 994 | if (result < 0) | 989 | if (result < 0) |
| 995 | goto out_kfree; | 990 | goto out_kfree; |
| 996 | 991 | ||
| @@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file, | |||
| 1017 | if (newval && newlen) { | 1012 | if (newval && newlen) { |
| 1018 | unsigned __user *vec = newval; | 1013 | unsigned __user *vec = newval; |
| 1019 | size_t length = newlen / sizeof(*vec); | 1014 | size_t length = newlen / sizeof(*vec); |
| 1020 | loff_t pos = 0; | ||
| 1021 | char *str, *end; | 1015 | char *str, *end; |
| 1022 | int i; | 1016 | int i; |
| 1023 | 1017 | ||
| @@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file, | |||
| 1033 | str += snprintf(str, end - str, "%lu\t", value); | 1027 | str += snprintf(str, end - str, "%lu\t", value); |
| 1034 | } | 1028 | } |
| 1035 | 1029 | ||
| 1036 | set_fs(KERNEL_DS); | 1030 | result = kernel_write(file, buffer, str - buffer, 0); |
| 1037 | result = vfs_write(file, buffer, str - buffer, &pos); | ||
| 1038 | set_fs(old_fs); | ||
| 1039 | if (result < 0) | 1031 | if (result < 0) |
| 1040 | goto out_kfree; | 1032 | goto out_kfree; |
| 1041 | } | 1033 | } |
| @@ -1049,7 +1041,6 @@ out: | |||
| 1049 | static ssize_t bin_ulongvec(struct file *file, | 1041 | static ssize_t bin_ulongvec(struct file *file, |
| 1050 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1042 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
| 1051 | { | 1043 | { |
| 1052 | mm_segment_t old_fs = get_fs(); | ||
| 1053 | ssize_t copied = 0; | 1044 | ssize_t copied = 0; |
| 1054 | char *buffer; | 1045 | char *buffer; |
| 1055 | ssize_t result; | 1046 | ssize_t result; |
| @@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file, | |||
| 1062 | if (oldval && oldlen) { | 1053 | if (oldval && oldlen) { |
| 1063 | unsigned long __user *vec = oldval; | 1054 | unsigned long __user *vec = oldval; |
| 1064 | size_t length = oldlen / sizeof(*vec); | 1055 | size_t length = oldlen / sizeof(*vec); |
| 1065 | loff_t pos = 0; | ||
| 1066 | char *str, *end; | 1056 | char *str, *end; |
| 1067 | int i; | 1057 | int i; |
| 1068 | 1058 | ||
| 1069 | set_fs(KERNEL_DS); | 1059 | result = kernel_read(file, 0, buffer, BUFSZ - 1); |
| 1070 | result = vfs_read(file, buffer, BUFSZ - 1, &pos); | ||
| 1071 | set_fs(old_fs); | ||
| 1072 | if (result < 0) | 1060 | if (result < 0) |
| 1073 | goto out_kfree; | 1061 | goto out_kfree; |
| 1074 | 1062 | ||
| @@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file, | |||
| 1095 | if (newval && newlen) { | 1083 | if (newval && newlen) { |
| 1096 | unsigned long __user *vec = newval; | 1084 | unsigned long __user *vec = newval; |
| 1097 | size_t length = newlen / sizeof(*vec); | 1085 | size_t length = newlen / sizeof(*vec); |
| 1098 | loff_t pos = 0; | ||
| 1099 | char *str, *end; | 1086 | char *str, *end; |
| 1100 | int i; | 1087 | int i; |
| 1101 | 1088 | ||
| @@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file, | |||
| 1111 | str += snprintf(str, end - str, "%lu\t", value); | 1098 | str += snprintf(str, end - str, "%lu\t", value); |
| 1112 | } | 1099 | } |
| 1113 | 1100 | ||
| 1114 | set_fs(KERNEL_DS); | 1101 | result = kernel_write(file, buffer, str - buffer, 0); |
| 1115 | result = vfs_write(file, buffer, str - buffer, &pos); | ||
| 1116 | set_fs(old_fs); | ||
| 1117 | if (result < 0) | 1102 | if (result < 0) |
| 1118 | goto out_kfree; | 1103 | goto out_kfree; |
| 1119 | } | 1104 | } |
| @@ -1127,19 +1112,15 @@ out: | |||
| 1127 | static ssize_t bin_uuid(struct file *file, | 1112 | static ssize_t bin_uuid(struct file *file, |
| 1128 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1113 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
| 1129 | { | 1114 | { |
| 1130 | mm_segment_t old_fs = get_fs(); | ||
| 1131 | ssize_t result, copied = 0; | 1115 | ssize_t result, copied = 0; |
| 1132 | 1116 | ||
| 1133 | /* Only supports reads */ | 1117 | /* Only supports reads */ |
| 1134 | if (oldval && oldlen) { | 1118 | if (oldval && oldlen) { |
| 1135 | loff_t pos = 0; | ||
| 1136 | char buf[40], *str = buf; | 1119 | char buf[40], *str = buf; |
| 1137 | unsigned char uuid[16]; | 1120 | unsigned char uuid[16]; |
| 1138 | int i; | 1121 | int i; |
| 1139 | 1122 | ||
| 1140 | set_fs(KERNEL_DS); | 1123 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); |
| 1141 | result = vfs_read(file, buf, sizeof(buf) - 1, &pos); | ||
| 1142 | set_fs(old_fs); | ||
| 1143 | if (result < 0) | 1124 | if (result < 0) |
| 1144 | goto out; | 1125 | goto out; |
| 1145 | 1126 | ||
| @@ -1175,18 +1156,14 @@ out: | |||
| 1175 | static ssize_t bin_dn_node_address(struct file *file, | 1156 | static ssize_t bin_dn_node_address(struct file *file, |
| 1176 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1157 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
| 1177 | { | 1158 | { |
| 1178 | mm_segment_t old_fs = get_fs(); | ||
| 1179 | ssize_t result, copied = 0; | 1159 | ssize_t result, copied = 0; |
| 1180 | 1160 | ||
| 1181 | if (oldval && oldlen) { | 1161 | if (oldval && oldlen) { |
| 1182 | loff_t pos = 0; | ||
| 1183 | char buf[15], *nodep; | 1162 | char buf[15], *nodep; |
| 1184 | unsigned long area, node; | 1163 | unsigned long area, node; |
| 1185 | __le16 dnaddr; | 1164 | __le16 dnaddr; |
| 1186 | 1165 | ||
| 1187 | set_fs(KERNEL_DS); | 1166 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); |
| 1188 | result = vfs_read(file, buf, sizeof(buf) - 1, &pos); | ||
| 1189 | set_fs(old_fs); | ||
| 1190 | if (result < 0) | 1167 | if (result < 0) |
| 1191 | goto out; | 1168 | goto out; |
| 1192 | 1169 | ||
| @@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
| 1194 | 1171 | ||
| 1195 | /* Convert the decnet address to binary */ | 1172 | /* Convert the decnet address to binary */ |
| 1196 | result = -EIO; | 1173 | result = -EIO; |
| 1197 | nodep = strchr(buf, '.') + 1; | 1174 | nodep = strchr(buf, '.'); |
| 1198 | if (!nodep) | 1175 | if (!nodep) |
| 1199 | goto out; | 1176 | goto out; |
| 1177 | ++nodep; | ||
| 1200 | 1178 | ||
| 1201 | area = simple_strtoul(buf, NULL, 10); | 1179 | area = simple_strtoul(buf, NULL, 10); |
| 1202 | node = simple_strtoul(nodep, NULL, 10); | 1180 | node = simple_strtoul(nodep, NULL, 10); |
| @@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
| 1215 | } | 1193 | } |
| 1216 | 1194 | ||
| 1217 | if (newval && newlen) { | 1195 | if (newval && newlen) { |
| 1218 | loff_t pos = 0; | ||
| 1219 | __le16 dnaddr; | 1196 | __le16 dnaddr; |
| 1220 | char buf[15]; | 1197 | char buf[15]; |
| 1221 | int len; | 1198 | int len; |
| @@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
| 1232 | le16_to_cpu(dnaddr) >> 10, | 1209 | le16_to_cpu(dnaddr) >> 10, |
| 1233 | le16_to_cpu(dnaddr) & 0x3ff); | 1210 | le16_to_cpu(dnaddr) & 0x3ff); |
| 1234 | 1211 | ||
| 1235 | set_fs(KERNEL_DS); | 1212 | result = kernel_write(file, buf, len, 0); |
| 1236 | result = vfs_write(file, buf, len, &pos); | ||
| 1237 | set_fs(old_fs); | ||
| 1238 | if (result < 0) | 1213 | if (result < 0) |
| 1239 | goto out; | 1214 | goto out; |
| 1240 | } | 1215 | } |
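The sysctl_binary changes above replace the set_fs(KERNEL_DS) + vfs_read/vfs_write dance with kernel_read()/kernel_write(), which accept kernel-space buffers directly. A hedged sketch of the resulting read path, reusing exactly the call signature seen in this patch (file, offset, buffer, length); the helper itself is illustrative and not part of the patch.

    #include <linux/fs.h>

    /* Sketch only: mirrors the calls used above, not a function from this patch */
    static ssize_t read_sysctl_file(struct file *file, char *buf, size_t len)
    {
        ssize_t result;

        /* no set_fs() juggling: kernel_read() takes a kernel pointer */
        result = kernel_read(file, 0, buf, len - 1);
        if (result < 0)
            return result;

        buf[result] = '\0';   /* NUL-terminate, as the callers above expect */
        return result;
    }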
diff --git a/kernel/time.c b/kernel/time.c index d226c6a3fd28..f8342a41efa6 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, | |||
| 115 | } | 115 | } |
| 116 | 116 | ||
| 117 | /* | 117 | /* |
| 118 | * Indicates if there is an offset between the system clock and the hardware | ||
| 119 | * clock/persistent clock/rtc. | ||
| 120 | */ | ||
| 121 | int persistent_clock_is_local; | ||
| 122 | |||
| 123 | /* | ||
| 118 | * Adjust the time obtained from the CMOS to be UTC time instead of | 124 | * Adjust the time obtained from the CMOS to be UTC time instead of |
| 119 | * local time. | 125 | * local time. |
| 120 | * | 126 | * |
| @@ -135,6 +141,8 @@ static inline void warp_clock(void) | |||
| 135 | struct timespec adjust; | 141 | struct timespec adjust; |
| 136 | 142 | ||
| 137 | adjust = current_kernel_time(); | 143 | adjust = current_kernel_time(); |
| 144 | if (sys_tz.tz_minuteswest != 0) | ||
| 145 | persistent_clock_is_local = 1; | ||
| 138 | adjust.tv_sec += sys_tz.tz_minuteswest * 60; | 146 | adjust.tv_sec += sys_tz.tz_minuteswest * 60; |
| 139 | do_settimeofday(&adjust); | 147 | do_settimeofday(&adjust); |
| 140 | } | 148 | } |
| @@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
| 232 | * Avoid unnecessary multiplications/divisions in the | 240 | * Avoid unnecessary multiplications/divisions in the |
| 233 | * two most common HZ cases: | 241 | * two most common HZ cases: |
| 234 | */ | 242 | */ |
| 235 | inline unsigned int jiffies_to_msecs(const unsigned long j) | 243 | unsigned int jiffies_to_msecs(const unsigned long j) |
| 236 | { | 244 | { |
| 237 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 245 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
| 238 | return (MSEC_PER_SEC / HZ) * j; | 246 | return (MSEC_PER_SEC / HZ) * j; |
| @@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j) | |||
| 248 | } | 256 | } |
| 249 | EXPORT_SYMBOL(jiffies_to_msecs); | 257 | EXPORT_SYMBOL(jiffies_to_msecs); |
| 250 | 258 | ||
| 251 | inline unsigned int jiffies_to_usecs(const unsigned long j) | 259 | unsigned int jiffies_to_usecs(const unsigned long j) |
| 252 | { | 260 | { |
| 253 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 261 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
| 254 | return (USEC_PER_SEC / HZ) * j; | 262 | return (USEC_PER_SEC / HZ) * j; |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8601f0db1261..24510d84efd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG | |||
| 12 | config ARCH_CLOCKSOURCE_DATA | 12 | config ARCH_CLOCKSOURCE_DATA |
| 13 | bool | 13 | bool |
| 14 | 14 | ||
| 15 | # Platforms have a persistent clock | ||
| 16 | config ALWAYS_USE_PERSISTENT_CLOCK | ||
| 17 | bool | ||
| 18 | default n | ||
| 19 | |||
| 15 | # Timekeeping vsyscall support | 20 | # Timekeeping vsyscall support |
| 16 | config GENERIC_TIME_VSYSCALL | 21 | config GENERIC_TIME_VSYSCALL |
| 17 | bool | 22 | bool |
| @@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD | |||
| 38 | default y | 43 | default y |
| 39 | depends on GENERIC_CLOCKEVENTS | 44 | depends on GENERIC_CLOCKEVENTS |
| 40 | 45 | ||
| 46 | # Architecture can handle broadcast in a driver-agnostic way | ||
| 47 | config ARCH_HAS_TICK_BROADCAST | ||
| 48 | bool | ||
| 49 | |||
| 41 | # Clockevents broadcasting infrastructure | 50 | # Clockevents broadcasting infrastructure |
| 42 | config GENERIC_CLOCKEVENTS_BROADCAST | 51 | config GENERIC_CLOCKEVENTS_BROADCAST |
| 43 | bool | 52 | bool |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977c..c6d6400ee137 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev, | |||
| 339 | clockevents_config(dev, freq); | 339 | clockevents_config(dev, freq); |
| 340 | clockevents_register_device(dev); | 340 | clockevents_register_device(dev); |
| 341 | } | 341 | } |
| 342 | EXPORT_SYMBOL_GPL(clockevents_config_and_register); | ||
| 342 | 343 | ||
| 343 | /** | 344 | /** |
| 344 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | 345 | * clockevents_update_freq - Update frequency and reprogram a clock event device. |
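With clockevents_config_and_register() now exported (GPL), modular timer drivers can compute mult/shift and register the device in one call. A hedged sketch of a typical call site; the device, callbacks and delta limits are illustrative and not taken from this patch.

    #include <linux/clockchips.h>
    #include <linux/cpumask.h>

    /* Illustrative callbacks -- a real driver programs its timer hardware here */
    static void my_timer_set_mode(enum clock_event_mode mode,
                                  struct clock_event_device *ced) { }

    static int my_timer_set_next_event(unsigned long delta,
                                       struct clock_event_device *ced)
    {
        return 0;
    }

    static struct clock_event_device my_ced = {
        .name           = "my-timer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 300,
        .set_mode       = my_timer_set_mode,
        .set_next_event = my_timer_set_next_event,
    };

    static void my_timer_init(u32 rate_hz)
    {
        my_ced.cpumask = cpumask_of(0);
        /* computes mult/shift for rate_hz, then registers the device */
        clockevents_config_and_register(&my_ced, rate_hz, 0xf, 0x7fffffff);
    }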
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4d669b..072bb066bb7d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
| 16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | #include <linux/rtc.h> | ||
| 18 | 19 | ||
| 19 | #include "tick-internal.h" | 20 | #include "tick-internal.h" |
| 20 | 21 | ||
| @@ -22,7 +23,7 @@ | |||
| 22 | * NTP timekeeping variables: | 23 | * NTP timekeeping variables: |
| 23 | */ | 24 | */ |
| 24 | 25 | ||
| 25 | DEFINE_SPINLOCK(ntp_lock); | 26 | DEFINE_RAW_SPINLOCK(ntp_lock); |
| 26 | 27 | ||
| 27 | 28 | ||
| 28 | /* USER_HZ period (usecs): */ | 29 | /* USER_HZ period (usecs): */ |
| @@ -347,7 +348,7 @@ void ntp_clear(void) | |||
| 347 | { | 348 | { |
| 348 | unsigned long flags; | 349 | unsigned long flags; |
| 349 | 350 | ||
| 350 | spin_lock_irqsave(&ntp_lock, flags); | 351 | raw_spin_lock_irqsave(&ntp_lock, flags); |
| 351 | 352 | ||
| 352 | time_adjust = 0; /* stop active adjtime() */ | 353 | time_adjust = 0; /* stop active adjtime() */ |
| 353 | time_status |= STA_UNSYNC; | 354 | time_status |= STA_UNSYNC; |
| @@ -361,7 +362,7 @@ void ntp_clear(void) | |||
| 361 | 362 | ||
| 362 | /* Clear PPS state variables */ | 363 | /* Clear PPS state variables */ |
| 363 | pps_clear(); | 364 | pps_clear(); |
| 364 | spin_unlock_irqrestore(&ntp_lock, flags); | 365 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 365 | 366 | ||
| 366 | } | 367 | } |
| 367 | 368 | ||
| @@ -371,9 +372,9 @@ u64 ntp_tick_length(void) | |||
| 371 | unsigned long flags; | 372 | unsigned long flags; |
| 372 | s64 ret; | 373 | s64 ret; |
| 373 | 374 | ||
| 374 | spin_lock_irqsave(&ntp_lock, flags); | 375 | raw_spin_lock_irqsave(&ntp_lock, flags); |
| 375 | ret = tick_length; | 376 | ret = tick_length; |
| 376 | spin_unlock_irqrestore(&ntp_lock, flags); | 377 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 377 | return ret; | 378 | return ret; |
| 378 | } | 379 | } |
| 379 | 380 | ||
| @@ -394,7 +395,7 @@ int second_overflow(unsigned long secs) | |||
| 394 | int leap = 0; | 395 | int leap = 0; |
| 395 | unsigned long flags; | 396 | unsigned long flags; |
| 396 | 397 | ||
| 397 | spin_lock_irqsave(&ntp_lock, flags); | 398 | raw_spin_lock_irqsave(&ntp_lock, flags); |
| 398 | 399 | ||
| 399 | /* | 400 | /* |
| 400 | * Leap second processing. If in leap-insert state at the end of the | 401 | * Leap second processing. If in leap-insert state at the end of the |
| @@ -478,13 +479,12 @@ int second_overflow(unsigned long secs) | |||
| 478 | time_adjust = 0; | 479 | time_adjust = 0; |
| 479 | 480 | ||
| 480 | out: | 481 | out: |
| 481 | spin_unlock_irqrestore(&ntp_lock, flags); | 482 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 482 | 483 | ||
| 483 | return leap; | 484 | return leap; |
| 484 | } | 485 | } |
| 485 | 486 | ||
| 486 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 487 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) |
| 487 | |||
| 488 | static void sync_cmos_clock(struct work_struct *work); | 488 | static void sync_cmos_clock(struct work_struct *work); |
| 489 | 489 | ||
| 490 | static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); | 490 | static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); |
| @@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 510 | } | 510 | } |
| 511 | 511 | ||
| 512 | getnstimeofday(&now); | 512 | getnstimeofday(&now); |
| 513 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) | 513 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { |
| 514 | fail = update_persistent_clock(now); | 514 | struct timespec adjust = now; |
| 515 | |||
| 516 | fail = -ENODEV; | ||
| 517 | if (persistent_clock_is_local) | ||
| 518 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | ||
| 519 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
| 520 | fail = update_persistent_clock(adjust); | ||
| 521 | #endif | ||
| 522 | #ifdef CONFIG_RTC_SYSTOHC | ||
| 523 | if (fail == -ENODEV) | ||
| 524 | fail = rtc_set_ntp_time(adjust); | ||
| 525 | #endif | ||
| 526 | } | ||
| 515 | 527 | ||
| 516 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); | 528 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); |
| 517 | if (next.tv_nsec <= 0) | 529 | if (next.tv_nsec <= 0) |
| 518 | next.tv_nsec += NSEC_PER_SEC; | 530 | next.tv_nsec += NSEC_PER_SEC; |
| 519 | 531 | ||
| 520 | if (!fail) | 532 | if (!fail || fail == -ENODEV) |
| 521 | next.tv_sec = 659; | 533 | next.tv_sec = 659; |
| 522 | else | 534 | else |
| 523 | next.tv_sec = 0; | 535 | next.tv_sec = 0; |
| @@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc) | |||
| 660 | 672 | ||
| 661 | getnstimeofday(&ts); | 673 | getnstimeofday(&ts); |
| 662 | 674 | ||
| 663 | spin_lock_irq(&ntp_lock); | 675 | raw_spin_lock_irq(&ntp_lock); |
| 664 | 676 | ||
| 665 | if (txc->modes & ADJ_ADJTIME) { | 677 | if (txc->modes & ADJ_ADJTIME) { |
| 666 | long save_adjust = time_adjust; | 678 | long save_adjust = time_adjust; |
| @@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc) | |||
| 702 | /* fill PPS status fields */ | 714 | /* fill PPS status fields */ |
| 703 | pps_fill_timex(txc); | 715 | pps_fill_timex(txc); |
| 704 | 716 | ||
| 705 | spin_unlock_irq(&ntp_lock); | 717 | raw_spin_unlock_irq(&ntp_lock); |
| 706 | 718 | ||
| 707 | txc->time.tv_sec = ts.tv_sec; | 719 | txc->time.tv_sec = ts.tv_sec; |
| 708 | txc->time.tv_usec = ts.tv_nsec; | 720 | txc->time.tv_usec = ts.tv_nsec; |
| @@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 900 | 912 | ||
| 901 | pts_norm = pps_normalize_ts(*phase_ts); | 913 | pts_norm = pps_normalize_ts(*phase_ts); |
| 902 | 914 | ||
| 903 | spin_lock_irqsave(&ntp_lock, flags); | 915 | raw_spin_lock_irqsave(&ntp_lock, flags); |
| 904 | 916 | ||
| 905 | /* clear the error bits, they will be set again if needed */ | 917 | /* clear the error bits, they will be set again if needed */ |
| 906 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | 918 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); |
| @@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 913 | * just start the frequency interval */ | 925 | * just start the frequency interval */ |
| 914 | if (unlikely(pps_fbase.tv_sec == 0)) { | 926 | if (unlikely(pps_fbase.tv_sec == 0)) { |
| 915 | pps_fbase = *raw_ts; | 927 | pps_fbase = *raw_ts; |
| 916 | spin_unlock_irqrestore(&ntp_lock, flags); | 928 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 917 | return; | 929 | return; |
| 918 | } | 930 | } |
| 919 | 931 | ||
| @@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 928 | time_status |= STA_PPSJITTER; | 940 | time_status |= STA_PPSJITTER; |
| 929 | /* restart the frequency calibration interval */ | 941 | /* restart the frequency calibration interval */ |
| 930 | pps_fbase = *raw_ts; | 942 | pps_fbase = *raw_ts; |
| 931 | spin_unlock_irqrestore(&ntp_lock, flags); | 943 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 932 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 944 | pr_err("hardpps: PPSJITTER: bad pulse\n"); |
| 933 | return; | 945 | return; |
| 934 | } | 946 | } |
| @@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 945 | 957 | ||
| 946 | hardpps_update_phase(pts_norm.nsec); | 958 | hardpps_update_phase(pts_norm.nsec); |
| 947 | 959 | ||
| 948 | spin_unlock_irqrestore(&ntp_lock, flags); | 960 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
| 949 | } | 961 | } |
| 950 | EXPORT_SYMBOL(hardpps); | 962 | EXPORT_SYMBOL(hardpps); |
| 951 | 963 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f113755695e2..2fb8cb88df8d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/smp.h> | ||
| 21 | 22 | ||
| 22 | #include "tick-internal.h" | 23 | #include "tick-internal.h" |
| 23 | 24 | ||
| @@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev) | |||
| 86 | return (dev && tick_broadcast_device.evtdev == dev); | 87 | return (dev && tick_broadcast_device.evtdev == dev); |
| 87 | } | 88 | } |
| 88 | 89 | ||
| 90 | static void err_broadcast(const struct cpumask *mask) | ||
| 91 | { | ||
| 92 | pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); | ||
| 93 | } | ||
| 94 | |||
| 95 | static void tick_device_setup_broadcast_func(struct clock_event_device *dev) | ||
| 96 | { | ||
| 97 | if (!dev->broadcast) | ||
| 98 | dev->broadcast = tick_broadcast; | ||
| 99 | if (!dev->broadcast) { | ||
| 100 | pr_warn_once("%s depends on broadcast, but no broadcast function available\n", | ||
| 101 | dev->name); | ||
| 102 | dev->broadcast = err_broadcast; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | |||
| 89 | /* | 106 | /* |
| 90 | * Check, if the device is disfunctional and a place holder, which | 107 | * Check, if the device is disfunctional and a place holder, which |
| 91 | * needs to be handled by the broadcast device. | 108 | * needs to be handled by the broadcast device. |
| @@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
| 105 | */ | 122 | */ |
| 106 | if (!tick_device_is_functional(dev)) { | 123 | if (!tick_device_is_functional(dev)) { |
| 107 | dev->event_handler = tick_handle_periodic; | 124 | dev->event_handler = tick_handle_periodic; |
| 125 | tick_device_setup_broadcast_func(dev); | ||
| 108 | cpumask_set_cpu(cpu, tick_get_broadcast_mask()); | 126 | cpumask_set_cpu(cpu, tick_get_broadcast_mask()); |
| 109 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | 127 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); |
| 110 | ret = 1; | 128 | ret = 1; |
| @@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
| 116 | */ | 134 | */ |
| 117 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { | 135 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { |
| 118 | int cpu = smp_processor_id(); | 136 | int cpu = smp_processor_id(); |
| 119 | |||
| 120 | cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); | 137 | cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); |
| 121 | tick_broadcast_clear_oneshot(cpu); | 138 | tick_broadcast_clear_oneshot(cpu); |
| 139 | } else { | ||
| 140 | tick_device_setup_broadcast_func(dev); | ||
| 122 | } | 141 | } |
| 123 | } | 142 | } |
| 124 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 143 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
| 125 | return ret; | 144 | return ret; |
| 126 | } | 145 | } |
| 127 | 146 | ||
| 147 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
| 148 | int tick_receive_broadcast(void) | ||
| 149 | { | ||
| 150 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | ||
| 151 | struct clock_event_device *evt = td->evtdev; | ||
| 152 | |||
| 153 | if (!evt) | ||
| 154 | return -ENODEV; | ||
| 155 | |||
| 156 | if (!evt->event_handler) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | evt->event_handler(evt); | ||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | #endif | ||
| 163 | |||
| 128 | /* | 164 | /* |
| 129 | * Broadcast the event to the cpus, which are set in the mask (mangled). | 165 | * Broadcast the event to the cpus, which are set in the mask (mangled). |
| 130 | */ | 166 | */ |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d58e552d9fd1..314b9ee07edf 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/profile.h> | 20 | #include <linux/profile.h> |
| 21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
| 22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 23 | #include <linux/irq_work.h> | ||
| 23 | 24 | ||
| 24 | #include <asm/irq_regs.h> | 25 | #include <asm/irq_regs.h> |
| 25 | 26 | ||
| @@ -28,7 +29,7 @@ | |||
| 28 | /* | 29 | /* |
| 29 | * Per cpu nohz control structure | 30 | * Per cpu nohz control structure |
| 30 | */ | 31 | */ |
| 31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 32 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
| 32 | 33 | ||
| 33 | /* | 34 | /* |
| 34 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 35 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
| @@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 331 | time_delta = timekeeping_max_deferment(); | 332 | time_delta = timekeeping_max_deferment(); |
| 332 | } while (read_seqretry(&jiffies_lock, seq)); | 333 | } while (read_seqretry(&jiffies_lock, seq)); |
| 333 | 334 | ||
| 334 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || | 335 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || |
| 335 | arch_needs_cpu(cpu)) { | 336 | arch_needs_cpu(cpu) || irq_work_needs_cpu()) { |
| 336 | next_jiffies = last_jiffies + 1; | 337 | next_jiffies = last_jiffies + 1; |
| 337 | delta_jiffies = 1; | 338 | delta_jiffies = 1; |
| 338 | } else { | 339 | } else { |
| @@ -631,8 +632,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
| 631 | 632 | ||
| 632 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | 633 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) |
| 633 | { | 634 | { |
| 634 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 635 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
| 635 | unsigned long ticks; | 636 | unsigned long ticks; |
| 637 | |||
| 638 | if (vtime_accounting_enabled()) | ||
| 639 | return; | ||
| 636 | /* | 640 | /* |
| 637 | * We stopped the tick in idle. Update process times would miss the | 641 | * We stopped the tick in idle. Update process times would miss the |
| 638 | * time we slept as update_process_times does only a 1 tick | 642 | * time we slept as update_process_times does only a 1 tick |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb0db3f..9a0bc98fbe1d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -29,6 +29,9 @@ static struct timekeeper timekeeper; | |||
| 29 | /* flag for if timekeeping is suspended */ | 29 | /* flag for if timekeeping is suspended */ |
| 30 | int __read_mostly timekeeping_suspended; | 30 | int __read_mostly timekeeping_suspended; |
| 31 | 31 | ||
| 32 | /* Flag for if there is a persistent clock on this platform */ | ||
| 33 | bool __read_mostly persistent_clock_exist = false; | ||
| 34 | |||
| 32 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 35 | static inline void tk_normalize_xtime(struct timekeeper *tk) |
| 33 | { | 36 | { |
| 34 | while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { | 37 | while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { |
| @@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 135 | } | 138 | } |
| 136 | 139 | ||
| 137 | /* Timekeeper helper functions. */ | 140 | /* Timekeeper helper functions. */ |
| 141 | |||
| 142 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | ||
| 143 | u32 (*arch_gettimeoffset)(void); | ||
| 144 | |||
| 145 | u32 get_arch_timeoffset(void) | ||
| 146 | { | ||
| 147 | if (likely(arch_gettimeoffset)) | ||
| 148 | return arch_gettimeoffset(); | ||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | #else | ||
| 152 | static inline u32 get_arch_timeoffset(void) { return 0; } | ||
| 153 | #endif | ||
| 154 | |||
| 138 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) | 155 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) |
| 139 | { | 156 | { |
| 140 | cycle_t cycle_now, cycle_delta; | 157 | cycle_t cycle_now, cycle_delta; |
| @@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) | |||
| 151 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; | 168 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; |
| 152 | nsec >>= tk->shift; | 169 | nsec >>= tk->shift; |
| 153 | 170 | ||
| 154 | /* If arch requires, add in gettimeoffset() */ | 171 | /* If arch requires, add in get_arch_timeoffset() */ |
| 155 | return nsec + arch_gettimeoffset(); | 172 | return nsec + get_arch_timeoffset(); |
| 156 | } | 173 | } |
| 157 | 174 | ||
| 158 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | 175 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) |
| @@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 171 | /* convert delta to nanoseconds. */ | 188 | /* convert delta to nanoseconds. */ |
| 172 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 189 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
| 173 | 190 | ||
| 174 | /* If arch requires, add in gettimeoffset() */ | 191 | /* If arch requires, add in get_arch_timeoffset() */ |
| 175 | return nsec + arch_gettimeoffset(); | 192 | return nsec + get_arch_timeoffset(); |
| 176 | } | 193 | } |
| 177 | 194 | ||
| 178 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | 195 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); |
| @@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
| 254 | 271 | ||
| 255 | tk->xtime_nsec += cycle_delta * tk->mult; | 272 | tk->xtime_nsec += cycle_delta * tk->mult; |
| 256 | 273 | ||
| 257 | /* If arch requires, add in gettimeoffset() */ | 274 | /* If arch requires, add in get_arch_timeoffset() */ |
| 258 | tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; | 275 | tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; |
| 259 | 276 | ||
| 260 | tk_normalize_xtime(tk); | 277 | tk_normalize_xtime(tk); |
| 261 | 278 | ||
| @@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
| 264 | } | 281 | } |
| 265 | 282 | ||
| 266 | /** | 283 | /** |
| 267 | * getnstimeofday - Returns the time of day in a timespec | 284 | * __getnstimeofday - Returns the time of day in a timespec. |
| 268 | * @ts: pointer to the timespec to be set | 285 | * @ts: pointer to the timespec to be set |
| 269 | * | 286 | * |
| 270 | * Returns the time of day in a timespec. | 287 | * Updates the time of day in the timespec. |
| 288 | * Returns 0 on success, or -ve when suspended (timespec will be undefined). | ||
| 271 | */ | 289 | */ |
| 272 | void getnstimeofday(struct timespec *ts) | 290 | int __getnstimeofday(struct timespec *ts) |
| 273 | { | 291 | { |
| 274 | struct timekeeper *tk = &timekeeper; | 292 | struct timekeeper *tk = &timekeeper; |
| 275 | unsigned long seq; | 293 | unsigned long seq; |
| 276 | s64 nsecs = 0; | 294 | s64 nsecs = 0; |
| 277 | 295 | ||
| 278 | WARN_ON(timekeeping_suspended); | ||
| 279 | |||
| 280 | do { | 296 | do { |
| 281 | seq = read_seqbegin(&tk->lock); | 297 | seq = read_seqbegin(&tk->lock); |
| 282 | 298 | ||
| @@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts) | |||
| 287 | 303 | ||
| 288 | ts->tv_nsec = 0; | 304 | ts->tv_nsec = 0; |
| 289 | timespec_add_ns(ts, nsecs); | 305 | timespec_add_ns(ts, nsecs); |
| 306 | |||
| 307 | /* | ||
| 308 | * Do not bail out early, in case there were callers still using | ||
| 309 | * the value, even in the face of the WARN_ON. | ||
| 310 | */ | ||
| 311 | if (unlikely(timekeeping_suspended)) | ||
| 312 | return -EAGAIN; | ||
| 313 | return 0; | ||
| 314 | } | ||
| 315 | EXPORT_SYMBOL(__getnstimeofday); | ||
| 316 | |||
| 317 | /** | ||
| 318 | * getnstimeofday - Returns the time of day in a timespec. | ||
| 319 | * @ts: pointer to the timespec to be set | ||
| 320 | * | ||
| 321 | * Returns the time of day in a timespec (WARN if suspended). | ||
| 322 | */ | ||
| 323 | void getnstimeofday(struct timespec *ts) | ||
| 324 | { | ||
| 325 | WARN_ON(__getnstimeofday(ts)); | ||
| 290 | } | 326 | } |
| 291 | EXPORT_SYMBOL(getnstimeofday); | 327 | EXPORT_SYMBOL(getnstimeofday); |
| 292 | 328 | ||
| @@ -640,12 +676,14 @@ void __init timekeeping_init(void) | |||
| 640 | struct timespec now, boot, tmp; | 676 | struct timespec now, boot, tmp; |
| 641 | 677 | ||
| 642 | read_persistent_clock(&now); | 678 | read_persistent_clock(&now); |
| 679 | |||
| 643 | if (!timespec_valid_strict(&now)) { | 680 | if (!timespec_valid_strict(&now)) { |
| 644 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 681 | pr_warn("WARNING: Persistent clock returned invalid value!\n" |
| 645 | " Check your CMOS/BIOS settings.\n"); | 682 | " Check your CMOS/BIOS settings.\n"); |
| 646 | now.tv_sec = 0; | 683 | now.tv_sec = 0; |
| 647 | now.tv_nsec = 0; | 684 | now.tv_nsec = 0; |
| 648 | } | 685 | } else if (now.tv_sec || now.tv_nsec) |
| 686 | persistent_clock_exist = true; | ||
| 649 | 687 | ||
| 650 | read_boot_clock(&boot); | 688 | read_boot_clock(&boot); |
| 651 | if (!timespec_valid_strict(&boot)) { | 689 | if (!timespec_valid_strict(&boot)) { |
| @@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 718 | { | 756 | { |
| 719 | struct timekeeper *tk = &timekeeper; | 757 | struct timekeeper *tk = &timekeeper; |
| 720 | unsigned long flags; | 758 | unsigned long flags; |
| 721 | struct timespec ts; | ||
| 722 | 759 | ||
| 723 | /* Make sure we don't set the clock twice */ | 760 | /* |
| 724 | read_persistent_clock(&ts); | 761 | * Make sure we don't set the clock twice, as timekeeping_resume() |
| 725 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | 762 | * already did it |
| 763 | */ | ||
| 764 | if (has_persistent_clock()) | ||
| 726 | return; | 765 | return; |
| 727 | 766 | ||
| 728 | write_seqlock_irqsave(&tk->lock, flags); | 767 | write_seqlock_irqsave(&tk->lock, flags); |
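__getnstimeofday() above returns 0 on success and a negative value while timekeeping is suspended, instead of only triggering the WARN that getnstimeofday() keeps. A minimal sketch of a caller that tolerates suspend; the caller name is illustrative.

    #include <linux/time.h>
    #include <linux/kernel.h>

    /* Illustrative caller, not part of this patch */
    static void sample_walltime(void)
    {
        struct timespec ts;

        if (__getnstimeofday(&ts)) {
            /* timekeeping suspended: ts contents are undefined */
            return;
        }
        pr_debug("now: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
    }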
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc new file mode 100644 index 000000000000..511bdf2cafda --- /dev/null +++ b/kernel/timeconst.bc | |||
| @@ -0,0 +1,108 @@ | |||
| 1 | scale=0 | ||
| 2 | |||
| 3 | define gcd(a,b) { | ||
| 4 | auto t; | ||
| 5 | while (b) { | ||
| 6 | t = b; | ||
| 7 | b = a % b; | ||
| 8 | a = t; | ||
| 9 | } | ||
| 10 | return a; | ||
| 11 | } | ||
| 12 | |||
| 13 | /* Division by reciprocal multiplication. */ | ||
| 14 | define fmul(b,n,d) { | ||
| 15 | return (2^b*n+d-1)/d; | ||
| 16 | } | ||
| 17 | |||
| 18 | /* Adjustment factor when a ceiling value is used. Use as: | ||
| 19 | (imul * n) + ((fmulxx * n + fadjxx) >> xx) */ | ||
| 20 | define fadj(b,n,d) { | ||
| 21 | auto v; | ||
| 22 | d = d/gcd(n,d); | ||
| 23 | v = 2^b*(d-1)/d; | ||
| 24 | return v; | ||
| 25 | } | ||
| 26 | |||
| 27 | /* Compute the appropriate mul/adj values as well as a shift count, | ||
| 28 | which brings the mul value into the range 2^b-1 <= x < 2^b. Such | ||
| 29 | a shift value will be correct in the signed integer range and off | ||
| 30 | by at most one in the upper half of the unsigned range. */ | ||
| 31 | define fmuls(b,n,d) { | ||
| 32 | auto s, m; | ||
| 33 | for (s = 0; 1; s++) { | ||
| 34 | m = fmul(s,n,d); | ||
| 35 | if (m >= 2^(b-1)) | ||
| 36 | return s; | ||
| 37 | } | ||
| 38 | return 0; | ||
| 39 | } | ||
| 40 | |||
| 41 | define timeconst(hz) { | ||
| 42 | print "/* Automatically generated by kernel/timeconst.bc */\n" | ||
| 43 | print "/* Time conversion constants for HZ == ", hz, " */\n" | ||
| 44 | print "\n" | ||
| 45 | |||
| 46 | print "#ifndef KERNEL_TIMECONST_H\n" | ||
| 47 | print "#define KERNEL_TIMECONST_H\n\n" | ||
| 48 | |||
| 49 | print "#include <linux/param.h>\n" | ||
| 50 | print "#include <linux/types.h>\n\n" | ||
| 51 | |||
| 52 | print "#if HZ != ", hz, "\n" | ||
| 53 | print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" | ||
| 54 | print "#endif\n\n" | ||
| 55 | |||
| 56 | if (hz < 2) { | ||
| 57 | print "#error Totally bogus HZ value!\n" | ||
| 58 | } else { | ||
| 59 | s=fmuls(32,1000,hz) | ||
| 60 | obase=16 | ||
| 61 | print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" | ||
| 62 | print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" | ||
| 63 | obase=10 | ||
| 64 | print "#define HZ_TO_MSEC_SHR32\t", s, "\n" | ||
| 65 | |||
| 66 | s=fmuls(32,hz,1000) | ||
| 67 | obase=16 | ||
| 68 | print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" | ||
| 69 | print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" | ||
| 70 | obase=10 | ||
| 71 | print "#define MSEC_TO_HZ_SHR32\t", s, "\n" | ||
| 72 | |||
| 73 | obase=10 | ||
| 74 | cd=gcd(hz,1000) | ||
| 75 | print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" | ||
| 76 | print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" | ||
| 77 | print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" | ||
| 78 | print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" | ||
| 79 | print "\n" | ||
| 80 | |||
| 81 | s=fmuls(32,1000000,hz) | ||
| 82 | obase=16 | ||
| 83 | print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" | ||
| 84 | print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" | ||
| 85 | obase=10 | ||
| 86 | print "#define HZ_TO_USEC_SHR32\t", s, "\n" | ||
| 87 | |||
| 88 | s=fmuls(32,hz,1000000) | ||
| 89 | obase=16 | ||
| 90 | print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" | ||
| 91 | print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" | ||
| 92 | obase=10 | ||
| 93 | print "#define USEC_TO_HZ_SHR32\t", s, "\n" | ||
| 94 | |||
| 95 | obase=10 | ||
| 96 | cd=gcd(hz,1000000) | ||
| 97 | print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" | ||
| 98 | print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" | ||
| 99 | print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" | ||
| 100 | print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" | ||
| 101 | print "\n" | ||
| 102 | |||
| 103 | print "#endif /* KERNEL_TIMECONST_H */\n" | ||
| 104 | } | ||
| 105 | halt | ||
| 106 | } | ||
| 107 | |||
| 108 | timeconst(hz) | ||
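timeconst.bc emits constants for division by reciprocal multiplication: converting jiffies to msec/usec (or back) becomes one 32-bit multiply, an optional adjustment, and a shift instead of a runtime division. A worked check in C for HZ == 100, using the values the script produces for that case; they match the 100 entry of the canned table in the deleted timeconst.pl below.

    #include <stdint.h>
    #include <stdio.h>

    /* Values timeconst.bc emits for HZ == 100 (reproduced here for illustration) */
    #define HZ_TO_MSEC_MUL32  UINT64_C(0xa0000000)
    #define HZ_TO_MSEC_ADJ32  UINT64_C(0x0)
    #define HZ_TO_MSEC_SHR32  28

    static unsigned int jiffies_to_msecs_hz100(unsigned long j)
    {
        /* (mul * j + adj) >> shr  ==  j * 1000 / 100  ==  j * 10 */
        return (unsigned int)((HZ_TO_MSEC_MUL32 * (uint64_t)j + HZ_TO_MSEC_ADJ32)
                              >> HZ_TO_MSEC_SHR32);
    }

    int main(void)
    {
        printf("%u %u\n", jiffies_to_msecs_hz100(1), jiffies_to_msecs_hz100(250));
        /* prints: 10 2500 */
        return 0;
    }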
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl deleted file mode 100644 index eb51d76e058a..000000000000 --- a/kernel/timeconst.pl +++ /dev/null | |||
| @@ -1,378 +0,0 @@ | |||
| 1 | #!/usr/bin/perl | ||
| 2 | # ----------------------------------------------------------------------- | ||
| 3 | # | ||
| 4 | # Copyright 2007-2008 rPath, Inc. - All Rights Reserved | ||
| 5 | # | ||
| 6 | # This file is part of the Linux kernel, and is made available under | ||
| 7 | # the terms of the GNU General Public License version 2 or (at your | ||
| 8 | # option) any later version; incorporated herein by reference. | ||
| 9 | # | ||
| 10 | # ----------------------------------------------------------------------- | ||
| 11 | # | ||
| 12 | |||
| 13 | # | ||
| 14 | # Usage: timeconst.pl HZ > timeconst.h | ||
| 15 | # | ||
| 16 | |||
| 17 | # Precomputed values for systems without Math::BigInt | ||
| 18 | # Generated by: | ||
| 19 | # timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 | ||
| 20 | %canned_values = ( | ||
| 21 | 24 => [ | ||
| 22 | '0xa6aaaaab','0x2aaaaaa',26, | ||
| 23 | 125,3, | ||
| 24 | '0xc49ba5e4','0x1fbe76c8b4',37, | ||
| 25 | 3,125, | ||
| 26 | '0xa2c2aaab','0xaaaa',16, | ||
| 27 | 125000,3, | ||
| 28 | '0xc9539b89','0x7fffbce4217d',47, | ||
| 29 | 3,125000, | ||
| 30 | ], 32 => [ | ||
| 31 | '0xfa000000','0x6000000',27, | ||
| 32 | 125,4, | ||
| 33 | '0x83126e98','0xfdf3b645a',36, | ||
| 34 | 4,125, | ||
| 35 | '0xf4240000','0x0',17, | ||
| 36 | 31250,1, | ||
| 37 | '0x8637bd06','0x3fff79c842fa',46, | ||
| 38 | 1,31250, | ||
| 39 | ], 48 => [ | ||
| 40 | '0xa6aaaaab','0x6aaaaaa',27, | ||
| 41 | 125,6, | ||
| 42 | '0xc49ba5e4','0xfdf3b645a',36, | ||
| 43 | 6,125, | ||
| 44 | '0xa2c2aaab','0x15555',17, | ||
| 45 | 62500,3, | ||
| 46 | '0xc9539b89','0x3fffbce4217d',46, | ||
| 47 | 3,62500, | ||
| 48 | ], 64 => [ | ||
| 49 | '0xfa000000','0xe000000',28, | ||
| 50 | 125,8, | ||
| 51 | '0x83126e98','0x7ef9db22d',35, | ||
| 52 | 8,125, | ||
| 53 | '0xf4240000','0x0',18, | ||
| 54 | 15625,1, | ||
| 55 | '0x8637bd06','0x1fff79c842fa',45, | ||
| 56 | 1,15625, | ||
| 57 | ], 100 => [ | ||
| 58 | '0xa0000000','0x0',28, | ||
| 59 | 10,1, | ||
| 60 | '0xcccccccd','0x733333333',35, | ||
| 61 | 1,10, | ||
| 62 | '0x9c400000','0x0',18, | ||
| 63 | 10000,1, | ||
| 64 | '0xd1b71759','0x1fff2e48e8a7',45, | ||
| 65 | 1,10000, | ||
| 66 | ], 122 => [ | ||
| 67 | '0x8325c53f','0xfbcda3a',28, | ||
| 68 | 500,61, | ||
| 69 | '0xf9db22d1','0x7fbe76c8b',35, | ||
| 70 | 61,500, | ||
| 71 | '0x8012e2a0','0x3ef36',18, | ||
| 72 | 500000,61, | ||
| 73 | '0xffda4053','0x1ffffbce4217',45, | ||
| 74 | 61,500000, | ||
| 75 | ], 128 => [ | ||
| 76 | '0xfa000000','0x1e000000',29, | ||
| 77 | 125,16, | ||
| 78 | '0x83126e98','0x3f7ced916',34, | ||
| 79 | 16,125, | ||
| 80 | '0xf4240000','0x40000',19, | ||
| 81 | 15625,2, | ||
| 82 | '0x8637bd06','0xfffbce4217d',44, | ||
| 83 | 2,15625, | ||
| 84 | ], 200 => [ | ||
| 85 | '0xa0000000','0x0',29, | ||
| 86 | 5,1, | ||
| 87 | '0xcccccccd','0x333333333',34, | ||
| 88 | 1,5, | ||
| 89 | '0x9c400000','0x0',19, | ||
| 90 | 5000,1, | ||
| 91 | '0xd1b71759','0xfff2e48e8a7',44, | ||
| 92 | 1,5000, | ||
| 93 | ], 250 => [ | ||
| 94 | '0x80000000','0x0',29, | ||
| 95 | 4,1, | ||
| 96 | '0x80000000','0x180000000',33, | ||
| 97 | 1,4, | ||
| 98 | '0xfa000000','0x0',20, | ||
| 99 | 4000,1, | ||
| 100 | '0x83126e98','0x7ff7ced9168',43, | ||
| 101 | 1,4000, | ||
| 102 | ], 256 => [ | ||
| 103 | '0xfa000000','0x3e000000',30, | ||
| 104 | 125,32, | ||
| 105 | '0x83126e98','0x1fbe76c8b',33, | ||
| 106 | 32,125, | ||
| 107 | '0xf4240000','0xc0000',20, | ||
| 108 | 15625,4, | ||
| 109 | '0x8637bd06','0x7ffde7210be',43, | ||
| 110 | 4,15625, | ||
| 111 | ], 300 => [ | ||
| 112 | '0xd5555556','0x2aaaaaaa',30, | ||
| 113 | 10,3, | ||
| 114 | '0x9999999a','0x1cccccccc',33, | ||
| 115 | 3,10, | ||
| 116 | '0xd0555556','0xaaaaa',20, | ||
| 117 | 10000,3, | ||
| 118 | '0x9d495183','0x7ffcb923a29',43, | ||
| 119 | 3,10000, | ||
| 120 | ], 512 => [ | ||
| 121 | '0xfa000000','0x7e000000',31, | ||
| 122 | 125,64, | ||
| 123 | '0x83126e98','0xfdf3b645',32, | ||
| 124 | 64,125, | ||
| 125 | '0xf4240000','0x1c0000',21, | ||
| 126 | 15625,8, | ||
| 127 | '0x8637bd06','0x3ffef39085f',42, | ||
| 128 | 8,15625, | ||
| 129 | ], 1000 => [ | ||
| 130 | '0x80000000','0x0',31, | ||
| 131 | 1,1, | ||
| 132 | '0x80000000','0x0',31, | ||
| 133 | 1,1, | ||
| 134 | '0xfa000000','0x0',22, | ||
| 135 | 1000,1, | ||
| 136 | '0x83126e98','0x1ff7ced9168',41, | ||
| 137 | 1,1000, | ||
| 138 | ], 1024 => [ | ||
| 139 | '0xfa000000','0xfe000000',32, | ||
| 140 | 125,128, | ||
| 141 | '0x83126e98','0x7ef9db22',31, | ||
| 142 | 128,125, | ||
| 143 | '0xf4240000','0x3c0000',22, | ||
| 144 | 15625,16, | ||
| 145 | '0x8637bd06','0x1fff79c842f',41, | ||
| 146 | 16,15625, | ||
| 147 | ], 1200 => [ | ||
| 148 | '0xd5555556','0xd5555555',32, | ||
| 149 | 5,6, | ||
| 150 | '0x9999999a','0x66666666',31, | ||
| 151 | 6,5, | ||
| 152 | '0xd0555556','0x2aaaaa',22, | ||
| 153 | 2500,3, | ||
| 154 | '0x9d495183','0x1ffcb923a29',41, | ||
| 155 | 3,2500, | ||
| 156 | ] | ||
| 157 | ); | ||
| 158 | |||
| 159 | $has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; | ||
| 160 | |||
| 161 | sub bint($) | ||
| 162 | { | ||
| 163 | my($x) = @_; | ||
| 164 | return Math::BigInt->new($x); | ||
| 165 | } | ||
| 166 | |||
| 167 | # | ||
| 168 | # Constants for division by reciprocal multiplication. | ||
| 169 | # (bits, numerator, denominator) | ||
| 170 | # | ||
| 171 | sub fmul($$$) | ||
| 172 | { | ||
| 173 | my ($b,$n,$d) = @_; | ||
| 174 | |||
| 175 | $n = bint($n); | ||
| 176 | $d = bint($d); | ||
| 177 | |||
| 178 | return scalar (($n << $b)+$d-bint(1))/$d; | ||
| 179 | } | ||
| 180 | |||
| 181 | sub fadj($$$) | ||
| 182 | { | ||
| 183 | my($b,$n,$d) = @_; | ||
| 184 | |||
| 185 | $n = bint($n); | ||
| 186 | $d = bint($d); | ||
| 187 | |||
| 188 | $d = $d/bgcd($n, $d); | ||
| 189 | return scalar (($d-bint(1)) << $b)/$d; | ||
| 190 | } | ||
| 191 | |||
| 192 | sub fmuls($$$) { | ||
| 193 | my($b,$n,$d) = @_; | ||
| 194 | my($s,$m); | ||
| 195 | my($thres) = bint(1) << ($b-1); | ||
| 196 | |||
| 197 | $n = bint($n); | ||
| 198 | $d = bint($d); | ||
| 199 | |||
| 200 | for ($s = 0; 1; $s++) { | ||
| 201 | $m = fmul($s,$n,$d); | ||
| 202 | return $s if ($m >= $thres); | ||
| 203 | } | ||
| 204 | return 0; | ||
| 205 | } | ||
| 206 | |||
| 207 | # Generate a hex value if the result fits in 64 bits; | ||
| 208 | # otherwise skip. | ||
| 209 | sub bignum_hex($) { | ||
| 210 | my($x) = @_; | ||
| 211 | my $s = $x->as_hex(); | ||
| 212 | |||
| 213 | return (length($s) > 18) ? undef : $s; | ||
| 214 | } | ||
| 215 | |||
| 216 | # Provides mul, adj, and shr factors for a specific | ||
| 217 | # (bit, time, hz) combination | ||
| 218 | sub muladj($$$) { | ||
| 219 | my($b, $t, $hz) = @_; | ||
| 220 | my $s = fmuls($b, $t, $hz); | ||
| 221 | my $m = fmul($s, $t, $hz); | ||
| 222 | my $a = fadj($s, $t, $hz); | ||
| 223 | return (bignum_hex($m), bignum_hex($a), $s); | ||
| 224 | } | ||
| 225 | |||
| 226 | # Provides numerator, denominator values | ||
| 227 | sub numden($$) { | ||
| 228 | my($n, $d) = @_; | ||
| 229 | my $g = bgcd($n, $d); | ||
| 230 | return ($n/$g, $d/$g); | ||
| 231 | } | ||
| 232 | |||
| 233 | # All values for a specific (time, hz) combo | ||
| 234 | sub conversions($$) { | ||
| 235 | my ($t, $hz) = @_; | ||
| 236 | my @val = (); | ||
| 237 | |||
| 238 | # HZ_TO_xx | ||
| 239 | push(@val, muladj(32, $t, $hz)); | ||
| 240 | push(@val, numden($t, $hz)); | ||
| 241 | |||
| 242 | # xx_TO_HZ | ||
| 243 | push(@val, muladj(32, $hz, $t)); | ||
| 244 | push(@val, numden($hz, $t)); | ||
| 245 | |||
| 246 | return @val; | ||
| 247 | } | ||
| 248 | |||
| 249 | sub compute_values($) { | ||
| 250 | my($hz) = @_; | ||
| 251 | my @val = (); | ||
| 252 | my $s, $m, $a, $g; | ||
| 253 | |||
| 254 | if (!$has_bigint) { | ||
| 255 | die "$0: HZ == $hz not canned and ". | ||
| 256 | "Math::BigInt not available\n"; | ||
| 257 | } | ||
| 258 | |||
| 259 | # MSEC conversions | ||
| 260 | push(@val, conversions(1000, $hz)); | ||
| 261 | |||
| 262 | # USEC conversions | ||
| 263 | push(@val, conversions(1000000, $hz)); | ||
| 264 | |||
| 265 | return @val; | ||
| 266 | } | ||
| 267 | |||
| 268 | sub outputval($$) | ||
| 269 | { | ||
| 270 | my($name, $val) = @_; | ||
| 271 | my $csuf; | ||
| 272 | |||
| 273 | if (defined($val)) { | ||
| 274 | if ($name !~ /SHR/) { | ||
| 275 | $val = "U64_C($val)"; | ||
| 276 | } | ||
| 277 | printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; | ||
| 278 | } | ||
| 279 | } | ||
| 280 | |||
| 281 | sub output($@) | ||
| 282 | { | ||
| 283 | my($hz, @val) = @_; | ||
| 284 | my $pfx, $bit, $suf, $s, $m, $a; | ||
| 285 | |||
| 286 | print "/* Automatically generated by kernel/timeconst.pl */\n"; | ||
| 287 | print "/* Conversion constants for HZ == $hz */\n"; | ||
| 288 | print "\n"; | ||
| 289 | print "#ifndef KERNEL_TIMECONST_H\n"; | ||
| 290 | print "#define KERNEL_TIMECONST_H\n"; | ||
| 291 | print "\n"; | ||
| 292 | |||
| 293 | print "#include <linux/param.h>\n"; | ||
| 294 | print "#include <linux/types.h>\n"; | ||
| 295 | |||
| 296 | print "\n"; | ||
| 297 | print "#if HZ != $hz\n"; | ||
| 298 | print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; | ||
| 299 | print "#endif\n"; | ||
| 300 | print "\n"; | ||
| 301 | |||
| 302 | foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', | ||
| 303 | 'HZ_TO_USEC','USEC_TO_HZ') { | ||
| 304 | foreach $bit (32) { | ||
| 305 | foreach $suf ('MUL', 'ADJ', 'SHR') { | ||
| 306 | outputval("${pfx}_$suf$bit", shift(@val)); | ||
| 307 | } | ||
| 308 | } | ||
| 309 | foreach $suf ('NUM', 'DEN') { | ||
| 310 | outputval("${pfx}_$suf", shift(@val)); | ||
| 311 | } | ||
| 312 | } | ||
| 313 | |||
| 314 | print "\n"; | ||
| 315 | print "#endif /* KERNEL_TIMECONST_H */\n"; | ||
| 316 | } | ||
| 317 | |||
| 318 | # Pretty-print Perl values | ||
| 319 | sub perlvals(@) { | ||
| 320 | my $v; | ||
| 321 | my @l = (); | ||
| 322 | |||
| 323 | foreach $v (@_) { | ||
| 324 | if (!defined($v)) { | ||
| 325 | push(@l, 'undef'); | ||
| 326 | } elsif ($v =~ /^0x/) { | ||
| 327 | push(@l, "\'".$v."\'"); | ||
| 328 | } else { | ||
| 329 | push(@l, $v.''); | ||
| 330 | } | ||
| 331 | } | ||
| 332 | return join(',', @l); | ||
| 333 | } | ||
| 334 | |||
| 335 | ($hz) = @ARGV; | ||
| 336 | |||
| 337 | # Use this to generate the %canned_values structure | ||
| 338 | if ($hz eq '--can') { | ||
| 339 | shift(@ARGV); | ||
| 340 | @hzlist = sort {$a <=> $b} (@ARGV); | ||
| 341 | |||
| 342 | print "# Precomputed values for systems without Math::BigInt\n"; | ||
| 343 | print "# Generated by:\n"; | ||
| 344 | print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; | ||
| 345 | print "\%canned_values = (\n"; | ||
| 346 | my $pf = "\t"; | ||
| 347 | foreach $hz (@hzlist) { | ||
| 348 | my @values = compute_values($hz); | ||
| 349 | print "$pf$hz => [\n"; | ||
| 350 | while (scalar(@values)) { | ||
| 351 | my $bit; | ||
| 352 | foreach $bit (32) { | ||
| 353 | my $m = shift(@values); | ||
| 354 | my $a = shift(@values); | ||
| 355 | my $s = shift(@values); | ||
| 356 | print "\t\t", perlvals($m,$a,$s), ",\n"; | ||
| 357 | } | ||
| 358 | my $n = shift(@values); | ||
| 359 | my $d = shift(@values); | ||
| 360 | print "\t\t", perlvals($n,$d), ",\n"; | ||
| 361 | } | ||
| 362 | print "\t]"; | ||
| 363 | $pf = ', '; | ||
| 364 | } | ||
| 365 | print "\n);\n"; | ||
| 366 | } else { | ||
| 367 | $hz += 0; # Force to number | ||
| 368 | if ($hz < 1) { | ||
| 369 | die "Usage: $0 HZ\n"; | ||
| 370 | } | ||
| 371 | |||
| 372 | @val = @{$canned_values{$hz}}; | ||
| 373 | if (!defined(@val)) { | ||
| 374 | @val = compute_values($hz); | ||
| 375 | } | ||
| 376 | output($hz, @val); | ||
| 377 | } | ||
| 378 | exit 0; | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 367d00858482..dbf7a78a1ef1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
| 40 | #include <linux/irq_work.h> | 40 | #include <linux/irq_work.h> |
| 41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
| 42 | #include <linux/sched/sysctl.h> | ||
| 42 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
| 43 | 44 | ||
| 44 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
| @@ -1351,7 +1352,6 @@ void update_process_times(int user_tick) | |||
| 1351 | account_process_tick(p, user_tick); | 1352 | account_process_tick(p, user_tick); |
| 1352 | run_local_timers(); | 1353 | run_local_timers(); |
| 1353 | rcu_check_callbacks(cpu, user_tick); | 1354 | rcu_check_callbacks(cpu, user_tick); |
| 1354 | printk_tick(); | ||
| 1355 | #ifdef CONFIG_IRQ_WORK | 1355 | #ifdef CONFIG_IRQ_WORK |
| 1356 | if (in_irq()) | 1356 | if (in_irq()) |
| 1357 | irq_work_run(); | 1357 | irq_work_run(); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f..192473b22799 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE | |||
| 39 | help | 39 | help |
| 40 | See Documentation/trace/ftrace-design.txt | 40 | See Documentation/trace/ftrace-design.txt |
| 41 | 41 | ||
| 42 | config HAVE_DYNAMIC_FTRACE_WITH_REGS | ||
| 43 | bool | ||
| 44 | |||
| 42 | config HAVE_FTRACE_MCOUNT_RECORD | 45 | config HAVE_FTRACE_MCOUNT_RECORD |
| 43 | bool | 46 | bool |
| 44 | help | 47 | help |
| @@ -78,21 +81,6 @@ config EVENT_TRACING | |||
| 78 | select CONTEXT_SWITCH_TRACER | 81 | select CONTEXT_SWITCH_TRACER |
| 79 | bool | 82 | bool |
| 80 | 83 | ||
| 81 | config EVENT_POWER_TRACING_DEPRECATED | ||
| 82 | depends on EVENT_TRACING | ||
| 83 | bool "Deprecated power event trace API, to be removed" | ||
| 84 | default y | ||
| 85 | help | ||
| 86 | Provides old power event types: | ||
| 87 | C-state/idle accounting events: | ||
| 88 | power:power_start | ||
| 89 | power:power_end | ||
| 90 | and old cpufreq accounting event: | ||
| 91 | power:power_frequency | ||
| 92 | This is for userspace compatibility | ||
| 93 | and will vanish after 5 kernel iterations, | ||
| 94 | namely 3.1. | ||
| 95 | |||
| 96 | config CONTEXT_SWITCH_TRACER | 84 | config CONTEXT_SWITCH_TRACER |
| 97 | bool | 85 | bool |
| 98 | 86 | ||
| @@ -250,6 +238,16 @@ config FTRACE_SYSCALLS | |||
| 250 | help | 238 | help |
| 251 | Basic tracer to catch the syscall entry and exit events. | 239 | Basic tracer to catch the syscall entry and exit events. |
| 252 | 240 | ||
| 241 | config TRACER_SNAPSHOT | ||
| 242 | bool "Create a snapshot trace buffer" | ||
| 243 | select TRACER_MAX_TRACE | ||
| 244 | help | ||
| 245 | Allow tracing users to take snapshot of the current buffer using the | ||
| 246 | ftrace interface, e.g.: | ||
| 247 | |||
| 248 | echo 1 > /sys/kernel/debug/tracing/snapshot | ||
| 249 | cat snapshot | ||
| 250 | |||
| 253 | config TRACE_BRANCH_PROFILING | 251 | config TRACE_BRANCH_PROFILING |
| 254 | bool | 252 | bool |
| 255 | select GENERIC_TRACER | 253 | select GENERIC_TRACER |
| @@ -434,6 +432,11 @@ config DYNAMIC_FTRACE | |||
| 434 | were made. If so, it runs stop_machine (stops all CPUS) | 432 | were made. If so, it runs stop_machine (stops all CPUS) |
| 435 | and modifies the code to jump over the call to ftrace. | 433 | and modifies the code to jump over the call to ftrace. |
| 436 | 434 | ||
| 435 | config DYNAMIC_FTRACE_WITH_REGS | ||
| 436 | def_bool y | ||
| 437 | depends on DYNAMIC_FTRACE | ||
| 438 | depends on HAVE_DYNAMIC_FTRACE_WITH_REGS | ||
| 439 | |||
| 437 | config FUNCTION_PROFILER | 440 | config FUNCTION_PROFILER |
| 438 | bool "Kernel function profiler" | 441 | bool "Kernel function profiler" |
| 439 | depends on FUNCTION_TRACER | 442 | depends on FUNCTION_TRACER |
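The TRACER_SNAPSHOT help above shows the intended shell usage; the same interaction from C looks roughly like the sketch below. This is a user-space illustration only: it assumes debugfs is mounted at /sys/kernel/debug and a kernel built with this option, and the meaning of the written values (1 allocates the spare buffer and swaps it with the live one, 0 frees the spare, anything else clears the snapshot) is taken from the tracing_snapshot_write() handler added further down in this diff.

/* Take a snapshot of the live trace buffer, then dump the frozen copy. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/snapshot";
	char buf[4096];
	ssize_t n;
	int fd;

	/* Equivalent of: echo 1 > snapshot */
	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, "1\n", 2) != 2) {
		perror(path);
		return 1;
	}
	close(fd);

	/* Equivalent of: cat snapshot */
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return 0;
}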
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb593f6a687e..9e5b8c272eec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) | |||
| 147 | return; | 147 | return; |
| 148 | 148 | ||
| 149 | local_irq_save(flags); | 149 | local_irq_save(flags); |
| 150 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); | 150 | buf = this_cpu_ptr(bt->msg_data); |
| 151 | va_start(args, fmt); | 151 | va_start(args, fmt); |
| 152 | n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); | 152 | n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); |
| 153 | va_end(args); | 153 | va_end(args); |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ffe4c5ad3f3..ab25b88aae56 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); | |||
| 111 | #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) | 111 | #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) |
| 112 | #endif | 112 | #endif |
| 113 | 113 | ||
| 114 | /* | ||
| 115 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | ||
| 116 | * can use rcu_dereference_raw() is that elements removed from this list | ||
| 117 | * are simply leaked, so there is no need to interact with a grace-period | ||
| 118 | * mechanism. The rcu_dereference_raw() calls are needed to handle | ||
| 119 | * concurrent insertions into the ftrace_global_list. | ||
| 120 | * | ||
| 121 | * Silly Alpha and silly pointer-speculation compiler optimizations! | ||
| 122 | */ | ||
| 123 | #define do_for_each_ftrace_op(op, list) \ | ||
| 124 | op = rcu_dereference_raw(list); \ | ||
| 125 | do | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Optimized for just a single item in the list (as that is the normal case). | ||
| 129 | */ | ||
| 130 | #define while_for_each_ftrace_op(op) \ | ||
| 131 | while (likely(op = rcu_dereference_raw((op)->next)) && \ | ||
| 132 | unlikely((op) != &ftrace_list_end)) | ||
| 133 | |||
| 114 | /** | 134 | /** |
| 115 | * ftrace_nr_registered_ops - return number of ops registered | 135 | * ftrace_nr_registered_ops - return number of ops registered |
| 116 | * | 136 | * |
| @@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void) | |||
| 132 | return cnt; | 152 | return cnt; |
| 133 | } | 153 | } |
| 134 | 154 | ||
| 135 | /* | ||
| 136 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | ||
| 137 | * can use rcu_dereference_raw() is that elements removed from this list | ||
| 138 | * are simply leaked, so there is no need to interact with a grace-period | ||
| 139 | * mechanism. The rcu_dereference_raw() calls are needed to handle | ||
| 140 | * concurrent insertions into the ftrace_global_list. | ||
| 141 | * | ||
| 142 | * Silly Alpha and silly pointer-speculation compiler optimizations! | ||
| 143 | */ | ||
| 144 | static void | 155 | static void |
| 145 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, | 156 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, |
| 146 | struct ftrace_ops *op, struct pt_regs *regs) | 157 | struct ftrace_ops *op, struct pt_regs *regs) |
| 147 | { | 158 | { |
| 148 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | 159 | int bit; |
| 160 | |||
| 161 | bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); | ||
| 162 | if (bit < 0) | ||
| 149 | return; | 163 | return; |
| 150 | 164 | ||
| 151 | trace_recursion_set(TRACE_GLOBAL_BIT); | 165 | do_for_each_ftrace_op(op, ftrace_global_list) { |
| 152 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | ||
| 153 | while (op != &ftrace_list_end) { | ||
| 154 | op->func(ip, parent_ip, op, regs); | 166 | op->func(ip, parent_ip, op, regs); |
| 155 | op = rcu_dereference_raw(op->next); /*see above*/ | 167 | } while_for_each_ftrace_op(op); |
| 156 | }; | 168 | |
| 157 | trace_recursion_clear(TRACE_GLOBAL_BIT); | 169 | trace_clear_recursion(bit); |
| 158 | } | 170 | } |
| 159 | 171 | ||
| 160 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | 172 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
| @@ -221,10 +233,24 @@ static void update_global_ops(void) | |||
| 221 | * registered callers. | 233 | * registered callers. |
| 222 | */ | 234 | */ |
| 223 | if (ftrace_global_list == &ftrace_list_end || | 235 | if (ftrace_global_list == &ftrace_list_end || |
| 224 | ftrace_global_list->next == &ftrace_list_end) | 236 | ftrace_global_list->next == &ftrace_list_end) { |
| 225 | func = ftrace_global_list->func; | 237 | func = ftrace_global_list->func; |
| 226 | else | 238 | /* |
| 239 | * As we are calling the function directly. | ||
| 240 | * If it does not have recursion protection, | ||
| 241 | * the function_trace_op needs to be updated | ||
| 242 | * accordingly. | ||
| 243 | */ | ||
| 244 | if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) | ||
| 245 | global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; | ||
| 246 | else | ||
| 247 | global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; | ||
| 248 | } else { | ||
| 227 | func = ftrace_global_list_func; | 249 | func = ftrace_global_list_func; |
| 250 | /* The list has its own recursion protection. */ | ||
| 251 | global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; | ||
| 252 | } | ||
| 253 | |||
| 228 | 254 | ||
| 229 | /* If we filter on pids, update to use the pid function */ | 255 | /* If we filter on pids, update to use the pid function */ |
| 230 | if (!list_empty(&ftrace_pids)) { | 256 | if (!list_empty(&ftrace_pids)) { |
| @@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 337 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | 363 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) |
| 338 | return -EINVAL; | 364 | return -EINVAL; |
| 339 | 365 | ||
| 340 | #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS | 366 | #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
| 341 | /* | 367 | /* |
| 342 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used | 368 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used |
| 343 | * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. | 369 | * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. |
| @@ -736,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
| 736 | { | 762 | { |
| 737 | struct ftrace_profile *rec; | 763 | struct ftrace_profile *rec; |
| 738 | struct hlist_head *hhd; | 764 | struct hlist_head *hhd; |
| 739 | struct hlist_node *n; | ||
| 740 | unsigned long key; | 765 | unsigned long key; |
| 741 | 766 | ||
| 742 | key = hash_long(ip, ftrace_profile_bits); | 767 | key = hash_long(ip, ftrace_profile_bits); |
| @@ -745,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
| 745 | if (hlist_empty(hhd)) | 770 | if (hlist_empty(hhd)) |
| 746 | return NULL; | 771 | return NULL; |
| 747 | 772 | ||
| 748 | hlist_for_each_entry_rcu(rec, n, hhd, node) { | 773 | hlist_for_each_entry_rcu(rec, hhd, node) { |
| 749 | if (rec->ip == ip) | 774 | if (rec->ip == ip) |
| 750 | return rec; | 775 | return rec; |
| 751 | } | 776 | } |
| @@ -1107,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
| 1107 | unsigned long key; | 1132 | unsigned long key; |
| 1108 | struct ftrace_func_entry *entry; | 1133 | struct ftrace_func_entry *entry; |
| 1109 | struct hlist_head *hhd; | 1134 | struct hlist_head *hhd; |
| 1110 | struct hlist_node *n; | ||
| 1111 | 1135 | ||
| 1112 | if (ftrace_hash_empty(hash)) | 1136 | if (ftrace_hash_empty(hash)) |
| 1113 | return NULL; | 1137 | return NULL; |
| @@ -1119,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
| 1119 | 1143 | ||
| 1120 | hhd = &hash->buckets[key]; | 1144 | hhd = &hash->buckets[key]; |
| 1121 | 1145 | ||
| 1122 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | 1146 | hlist_for_each_entry_rcu(entry, hhd, hlist) { |
| 1123 | if (entry->ip == ip) | 1147 | if (entry->ip == ip) |
| 1124 | return entry; | 1148 | return entry; |
| 1125 | } | 1149 | } |
| @@ -1176,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash, | |||
| 1176 | static void ftrace_hash_clear(struct ftrace_hash *hash) | 1200 | static void ftrace_hash_clear(struct ftrace_hash *hash) |
| 1177 | { | 1201 | { |
| 1178 | struct hlist_head *hhd; | 1202 | struct hlist_head *hhd; |
| 1179 | struct hlist_node *tp, *tn; | 1203 | struct hlist_node *tn; |
| 1180 | struct ftrace_func_entry *entry; | 1204 | struct ftrace_func_entry *entry; |
| 1181 | int size = 1 << hash->size_bits; | 1205 | int size = 1 << hash->size_bits; |
| 1182 | int i; | 1206 | int i; |
| @@ -1186,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) | |||
| 1186 | 1210 | ||
| 1187 | for (i = 0; i < size; i++) { | 1211 | for (i = 0; i < size; i++) { |
| 1188 | hhd = &hash->buckets[i]; | 1212 | hhd = &hash->buckets[i]; |
| 1189 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | 1213 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) |
| 1190 | free_hash_entry(hash, entry); | 1214 | free_hash_entry(hash, entry); |
| 1191 | } | 1215 | } |
| 1192 | FTRACE_WARN_ON(hash->count); | 1216 | FTRACE_WARN_ON(hash->count); |
| @@ -1249,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
| 1249 | { | 1273 | { |
| 1250 | struct ftrace_func_entry *entry; | 1274 | struct ftrace_func_entry *entry; |
| 1251 | struct ftrace_hash *new_hash; | 1275 | struct ftrace_hash *new_hash; |
| 1252 | struct hlist_node *tp; | ||
| 1253 | int size; | 1276 | int size; |
| 1254 | int ret; | 1277 | int ret; |
| 1255 | int i; | 1278 | int i; |
| @@ -1264,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
| 1264 | 1287 | ||
| 1265 | size = 1 << hash->size_bits; | 1288 | size = 1 << hash->size_bits; |
| 1266 | for (i = 0; i < size; i++) { | 1289 | for (i = 0; i < size; i++) { |
| 1267 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | 1290 | hlist_for_each_entry(entry, &hash->buckets[i], hlist) { |
| 1268 | ret = add_hash_entry(new_hash, entry->ip); | 1291 | ret = add_hash_entry(new_hash, entry->ip); |
| 1269 | if (ret < 0) | 1292 | if (ret < 0) |
| 1270 | goto free_hash; | 1293 | goto free_hash; |
| @@ -1290,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1290 | struct ftrace_hash **dst, struct ftrace_hash *src) | 1313 | struct ftrace_hash **dst, struct ftrace_hash *src) |
| 1291 | { | 1314 | { |
| 1292 | struct ftrace_func_entry *entry; | 1315 | struct ftrace_func_entry *entry; |
| 1293 | struct hlist_node *tp, *tn; | 1316 | struct hlist_node *tn; |
| 1294 | struct hlist_head *hhd; | 1317 | struct hlist_head *hhd; |
| 1295 | struct ftrace_hash *old_hash; | 1318 | struct ftrace_hash *old_hash; |
| 1296 | struct ftrace_hash *new_hash; | 1319 | struct ftrace_hash *new_hash; |
| @@ -1336,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1336 | size = 1 << src->size_bits; | 1359 | size = 1 << src->size_bits; |
| 1337 | for (i = 0; i < size; i++) { | 1360 | for (i = 0; i < size; i++) { |
| 1338 | hhd = &src->buckets[i]; | 1361 | hhd = &src->buckets[i]; |
| 1339 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | 1362 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) { |
| 1340 | if (bits > 0) | 1363 | if (bits > 0) |
| 1341 | key = hash_long(entry->ip, bits); | 1364 | key = hash_long(entry->ip, bits); |
| 1342 | else | 1365 | else |
| @@ -2875,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
| 2875 | { | 2898 | { |
| 2876 | struct ftrace_func_probe *entry; | 2899 | struct ftrace_func_probe *entry; |
| 2877 | struct hlist_head *hhd; | 2900 | struct hlist_head *hhd; |
| 2878 | struct hlist_node *n; | ||
| 2879 | unsigned long key; | 2901 | unsigned long key; |
| 2880 | 2902 | ||
| 2881 | key = hash_long(ip, FTRACE_HASH_BITS); | 2903 | key = hash_long(ip, FTRACE_HASH_BITS); |
| @@ -2891,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
| 2891 | * on the hash. rcu_read_lock is too dangerous here. | 2913 | * on the hash. rcu_read_lock is too dangerous here. |
| 2892 | */ | 2914 | */ |
| 2893 | preempt_disable_notrace(); | 2915 | preempt_disable_notrace(); |
| 2894 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 2916 | hlist_for_each_entry_rcu(entry, hhd, node) { |
| 2895 | if (entry->ip == ip) | 2917 | if (entry->ip == ip) |
| 2896 | entry->ops->func(ip, parent_ip, &entry->data); | 2918 | entry->ops->func(ip, parent_ip, &entry->data); |
| 2897 | } | 2919 | } |
| @@ -3042,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3042 | void *data, int flags) | 3064 | void *data, int flags) |
| 3043 | { | 3065 | { |
| 3044 | struct ftrace_func_probe *entry; | 3066 | struct ftrace_func_probe *entry; |
| 3045 | struct hlist_node *n, *tmp; | 3067 | struct hlist_node *tmp; |
| 3046 | char str[KSYM_SYMBOL_LEN]; | 3068 | char str[KSYM_SYMBOL_LEN]; |
| 3047 | int type = MATCH_FULL; | 3069 | int type = MATCH_FULL; |
| 3048 | int i, len = 0; | 3070 | int i, len = 0; |
| @@ -3065,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3065 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 3087 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { |
| 3066 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 3088 | struct hlist_head *hhd = &ftrace_func_hash[i]; |
| 3067 | 3089 | ||
| 3068 | hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { | 3090 | hlist_for_each_entry_safe(entry, tmp, hhd, node) { |
| 3069 | 3091 | ||
| 3070 | /* break up if statements for readability */ | 3092 | /* break up if statements for readability */ |
| 3071 | if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) | 3093 | if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) |
| @@ -3970,35 +3992,49 @@ static void ftrace_init_module(struct module *mod, | |||
| 3970 | ftrace_process_locs(mod, start, end); | 3992 | ftrace_process_locs(mod, start, end); |
| 3971 | } | 3993 | } |
| 3972 | 3994 | ||
| 3973 | static int ftrace_module_notify(struct notifier_block *self, | 3995 | static int ftrace_module_notify_enter(struct notifier_block *self, |
| 3974 | unsigned long val, void *data) | 3996 | unsigned long val, void *data) |
| 3975 | { | 3997 | { |
| 3976 | struct module *mod = data; | 3998 | struct module *mod = data; |
| 3977 | 3999 | ||
| 3978 | switch (val) { | 4000 | if (val == MODULE_STATE_COMING) |
| 3979 | case MODULE_STATE_COMING: | ||
| 3980 | ftrace_init_module(mod, mod->ftrace_callsites, | 4001 | ftrace_init_module(mod, mod->ftrace_callsites, |
| 3981 | mod->ftrace_callsites + | 4002 | mod->ftrace_callsites + |
| 3982 | mod->num_ftrace_callsites); | 4003 | mod->num_ftrace_callsites); |
| 3983 | break; | 4004 | return 0; |
| 3984 | case MODULE_STATE_GOING: | 4005 | } |
| 4006 | |||
| 4007 | static int ftrace_module_notify_exit(struct notifier_block *self, | ||
| 4008 | unsigned long val, void *data) | ||
| 4009 | { | ||
| 4010 | struct module *mod = data; | ||
| 4011 | |||
| 4012 | if (val == MODULE_STATE_GOING) | ||
| 3985 | ftrace_release_mod(mod); | 4013 | ftrace_release_mod(mod); |
| 3986 | break; | ||
| 3987 | } | ||
| 3988 | 4014 | ||
| 3989 | return 0; | 4015 | return 0; |
| 3990 | } | 4016 | } |
| 3991 | #else | 4017 | #else |
| 3992 | static int ftrace_module_notify(struct notifier_block *self, | 4018 | static int ftrace_module_notify_enter(struct notifier_block *self, |
| 3993 | unsigned long val, void *data) | 4019 | unsigned long val, void *data) |
| 4020 | { | ||
| 4021 | return 0; | ||
| 4022 | } | ||
| 4023 | static int ftrace_module_notify_exit(struct notifier_block *self, | ||
| 4024 | unsigned long val, void *data) | ||
| 3994 | { | 4025 | { |
| 3995 | return 0; | 4026 | return 0; |
| 3996 | } | 4027 | } |
| 3997 | #endif /* CONFIG_MODULES */ | 4028 | #endif /* CONFIG_MODULES */ |
| 3998 | 4029 | ||
| 3999 | struct notifier_block ftrace_module_nb = { | 4030 | struct notifier_block ftrace_module_enter_nb = { |
| 4000 | .notifier_call = ftrace_module_notify, | 4031 | .notifier_call = ftrace_module_notify_enter, |
| 4001 | .priority = 0, | 4032 | .priority = INT_MAX, /* Run before anything that can use kprobes */ |
| 4033 | }; | ||
| 4034 | |||
| 4035 | struct notifier_block ftrace_module_exit_nb = { | ||
| 4036 | .notifier_call = ftrace_module_notify_exit, | ||
| 4037 | .priority = INT_MIN, /* Run after anything that can remove kprobes */ | ||
| 4002 | }; | 4038 | }; |
| 4003 | 4039 | ||
| 4004 | extern unsigned long __start_mcount_loc[]; | 4040 | extern unsigned long __start_mcount_loc[]; |
| @@ -4032,9 +4068,13 @@ void __init ftrace_init(void) | |||
| 4032 | __start_mcount_loc, | 4068 | __start_mcount_loc, |
| 4033 | __stop_mcount_loc); | 4069 | __stop_mcount_loc); |
| 4034 | 4070 | ||
| 4035 | ret = register_module_notifier(&ftrace_module_nb); | 4071 | ret = register_module_notifier(&ftrace_module_enter_nb); |
| 4036 | if (ret) | 4072 | if (ret) |
| 4037 | pr_warning("Failed to register trace ftrace module notifier\n"); | 4073 | pr_warning("Failed to register trace ftrace module enter notifier\n"); |
| 4074 | |||
| 4075 | ret = register_module_notifier(&ftrace_module_exit_nb); | ||
| 4076 | if (ret) | ||
| 4077 | pr_warning("Failed to register trace ftrace module exit notifier\n"); | ||
| 4038 | 4078 | ||
| 4039 | set_ftrace_early_filters(); | 4079 | set_ftrace_early_filters(); |
| 4040 | 4080 | ||
| @@ -4090,14 +4130,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
| 4090 | */ | 4130 | */ |
| 4091 | preempt_disable_notrace(); | 4131 | preempt_disable_notrace(); |
| 4092 | trace_recursion_set(TRACE_CONTROL_BIT); | 4132 | trace_recursion_set(TRACE_CONTROL_BIT); |
| 4093 | op = rcu_dereference_raw(ftrace_control_list); | 4133 | do_for_each_ftrace_op(op, ftrace_control_list) { |
| 4094 | while (op != &ftrace_list_end) { | ||
| 4095 | if (!ftrace_function_local_disabled(op) && | 4134 | if (!ftrace_function_local_disabled(op) && |
| 4096 | ftrace_ops_test(op, ip)) | 4135 | ftrace_ops_test(op, ip)) |
| 4097 | op->func(ip, parent_ip, op, regs); | 4136 | op->func(ip, parent_ip, op, regs); |
| 4098 | 4137 | } while_for_each_ftrace_op(op); | |
| 4099 | op = rcu_dereference_raw(op->next); | ||
| 4100 | }; | ||
| 4101 | trace_recursion_clear(TRACE_CONTROL_BIT); | 4138 | trace_recursion_clear(TRACE_CONTROL_BIT); |
| 4102 | preempt_enable_notrace(); | 4139 | preempt_enable_notrace(); |
| 4103 | } | 4140 | } |
| @@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
| 4112 | struct ftrace_ops *ignored, struct pt_regs *regs) | 4149 | struct ftrace_ops *ignored, struct pt_regs *regs) |
| 4113 | { | 4150 | { |
| 4114 | struct ftrace_ops *op; | 4151 | struct ftrace_ops *op; |
| 4152 | int bit; | ||
| 4115 | 4153 | ||
| 4116 | if (function_trace_stop) | 4154 | if (function_trace_stop) |
| 4117 | return; | 4155 | return; |
| 4118 | 4156 | ||
| 4119 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | 4157 | bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); |
| 4158 | if (bit < 0) | ||
| 4120 | return; | 4159 | return; |
| 4121 | 4160 | ||
| 4122 | trace_recursion_set(TRACE_INTERNAL_BIT); | ||
| 4123 | /* | 4161 | /* |
| 4124 | * Some of the ops may be dynamically allocated, | 4162 | * Some of the ops may be dynamically allocated, |
| 4125 | * they must be freed after a synchronize_sched(). | 4163 | * they must be freed after a synchronize_sched(). |
| 4126 | */ | 4164 | */ |
| 4127 | preempt_disable_notrace(); | 4165 | preempt_disable_notrace(); |
| 4128 | op = rcu_dereference_raw(ftrace_ops_list); | 4166 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 4129 | while (op != &ftrace_list_end) { | ||
| 4130 | if (ftrace_ops_test(op, ip)) | 4167 | if (ftrace_ops_test(op, ip)) |
| 4131 | op->func(ip, parent_ip, op, regs); | 4168 | op->func(ip, parent_ip, op, regs); |
| 4132 | op = rcu_dereference_raw(op->next); | 4169 | } while_for_each_ftrace_op(op); |
| 4133 | }; | ||
| 4134 | preempt_enable_notrace(); | 4170 | preempt_enable_notrace(); |
| 4135 | trace_recursion_clear(TRACE_INTERNAL_BIT); | 4171 | trace_clear_recursion(bit); |
| 4136 | } | 4172 | } |
| 4137 | 4173 | ||
| 4138 | /* | 4174 | /* |
| @@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
| 4143 | * Archs are to support both the regs and ftrace_ops at the same time. | 4179 | * Archs are to support both the regs and ftrace_ops at the same time. |
| 4144 | * If they support ftrace_ops, it is assumed they support regs. | 4180 | * If they support ftrace_ops, it is assumed they support regs. |
| 4145 | * If call backs want to use regs, they must either check for regs | 4181 | * If call backs want to use regs, they must either check for regs |
| 4146 | * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. | 4182 | * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. |
| 4147 | * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. | 4183 | * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. |
| 4148 | * An architecture can pass partial regs with ftrace_ops and still | 4184 | * An architecture can pass partial regs with ftrace_ops and still |
| 4149 | * set the ARCH_SUPPORT_FTARCE_OPS. | 4185 | * set the ARCH_SUPPORT_FTARCE_OPS. |
| 4150 | */ | 4186 | */ |
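Several call sites in this file are converted to the do_for_each_ftrace_op()/while_for_each_ftrace_op() pair defined near the top of the diff. The stand-alone sketch below mimics that shape with an ordinary singly linked list and a stub-terminated sentinel; it is single-threaded, so the rcu_dereference_raw() calls and the likely()/unlikely() hints of the real macros are deliberately dropped.

#include <stdio.h>
#include <string.h>

struct op {
	struct op *next;
	void (*func)(const char *msg);
};

/* Sentinel with a no-op callback, in the spirit of ftrace_list_end. */
static void stub(const char *msg) { (void)msg; }
static void say_hello(const char *msg) { printf("hello: %s\n", msg); }
static void say_len(const char *msg) { printf("len: %zu\n", strlen(msg)); }

static struct op list_end = { .next = NULL, .func = stub };
static struct op op2 = { .next = &list_end, .func = say_len };
static struct op op1 = { .next = &op2,      .func = say_hello };
static struct op *op_list = &op1;

/* Same do/while shape as the kernel macros, minus RCU and branch hints. */
#define do_for_each_op(op, list)	\
	op = (list);			\
	do

#define while_for_each_op(op)		\
	while ((op = (op)->next) && (op) != &list_end)

int main(void)
{
	struct op *op;

	/* Runs say_hello and say_len; the walk stops at the sentinel. */
	do_for_each_op(op, op_list) {
		op->func("list walk");
	} while_for_each_op(op);

	return 0;
}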
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b223..1c71382b283d 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
| @@ -13,8 +13,5 @@ | |||
| 13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
| 14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
| 15 | 15 | ||
| 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED | ||
| 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
| 18 | #endif | ||
| 19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); |
| 20 | 17 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcd..7244acde77b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -3,8 +3,10 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> | 4 | * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> |
| 5 | */ | 5 | */ |
| 6 | #include <linux/ftrace_event.h> | ||
| 6 | #include <linux/ring_buffer.h> | 7 | #include <linux/ring_buffer.h> |
| 7 | #include <linux/trace_clock.h> | 8 | #include <linux/trace_clock.h> |
| 9 | #include <linux/trace_seq.h> | ||
| 8 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
| 9 | #include <linux/debugfs.h> | 11 | #include <linux/debugfs.h> |
| 10 | #include <linux/uaccess.h> | 12 | #include <linux/uaccess.h> |
| @@ -21,7 +23,6 @@ | |||
| 21 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
| 22 | 24 | ||
| 23 | #include <asm/local.h> | 25 | #include <asm/local.h> |
| 24 | #include "trace.h" | ||
| 25 | 26 | ||
| 26 | static void update_pages_handler(struct work_struct *work); | 27 | static void update_pages_handler(struct work_struct *work); |
| 27 | 28 | ||
| @@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
| 2432 | 2433 | ||
| 2433 | #ifdef CONFIG_TRACING | 2434 | #ifdef CONFIG_TRACING |
| 2434 | 2435 | ||
| 2435 | #define TRACE_RECURSIVE_DEPTH 16 | 2436 | /* |
| 2437 | * The lock and unlock are done within a preempt disable section. | ||
| 2438 | * The current_context per_cpu variable can only be modified | ||
| 2439 | * by the current task between lock and unlock. But it can | ||
| 2440 | * be modified more than once via an interrupt. To pass this | ||
| 2441 | * information from the lock to the unlock without having to | ||
| 2442 | * access the 'in_interrupt()' functions again (which do show | ||
| 2443 | * a bit of overhead in something as critical as function tracing, | ||
| 2444 | * we use a bitmask trick. | ||
| 2445 | * | ||
| 2446 | * bit 0 = NMI context | ||
| 2447 | * bit 1 = IRQ context | ||
| 2448 | * bit 2 = SoftIRQ context | ||
| 2449 | * bit 3 = normal context. | ||
| 2450 | * | ||
| 2451 | * This works because this is the order of contexts that can | ||
| 2452 | * preempt other contexts. A SoftIRQ never preempts an IRQ | ||
| 2453 | * context. | ||
| 2454 | * | ||
| 2455 | * When the context is determined, the corresponding bit is | ||
| 2456 | * checked and set (if it was set, then a recursion of that context | ||
| 2457 | * happened). | ||
| 2458 | * | ||
| 2459 | * On unlock, we need to clear this bit. To do so, just subtract | ||
| 2460 | * 1 from the current_context and AND it to itself. | ||
| 2461 | * | ||
| 2462 | * (binary) | ||
| 2463 | * 101 - 1 = 100 | ||
| 2464 | * 101 & 100 = 100 (clearing bit zero) | ||
| 2465 | * | ||
| 2466 | * 1010 - 1 = 1001 | ||
| 2467 | * 1010 & 1001 = 1000 (clearing bit 1) | ||
| 2468 | * | ||
| 2469 | * The least significant bit can be cleared this way, and it | ||
| 2470 | * just so happens that it is the same bit corresponding to | ||
| 2471 | * the current context. | ||
| 2472 | */ | ||
| 2473 | static DEFINE_PER_CPU(unsigned int, current_context); | ||
| 2436 | 2474 | ||
| 2437 | /* Keep this code out of the fast path cache */ | 2475 | static __always_inline int trace_recursive_lock(void) |
| 2438 | static noinline void trace_recursive_fail(void) | ||
| 2439 | { | 2476 | { |
| 2440 | /* Disable all tracing before we do anything else */ | 2477 | unsigned int val = this_cpu_read(current_context); |
| 2441 | tracing_off_permanent(); | 2478 | int bit; |
| 2442 | |||
| 2443 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" | ||
| 2444 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", | ||
| 2445 | trace_recursion_buffer(), | ||
| 2446 | hardirq_count() >> HARDIRQ_SHIFT, | ||
| 2447 | softirq_count() >> SOFTIRQ_SHIFT, | ||
| 2448 | in_nmi()); | ||
| 2449 | |||
| 2450 | WARN_ON_ONCE(1); | ||
| 2451 | } | ||
| 2452 | 2479 | ||
| 2453 | static inline int trace_recursive_lock(void) | 2480 | if (in_interrupt()) { |
| 2454 | { | 2481 | if (in_nmi()) |
| 2455 | trace_recursion_inc(); | 2482 | bit = 0; |
| 2483 | else if (in_irq()) | ||
| 2484 | bit = 1; | ||
| 2485 | else | ||
| 2486 | bit = 2; | ||
| 2487 | } else | ||
| 2488 | bit = 3; | ||
| 2456 | 2489 | ||
| 2457 | if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) | 2490 | if (unlikely(val & (1 << bit))) |
| 2458 | return 0; | 2491 | return 1; |
| 2459 | 2492 | ||
| 2460 | trace_recursive_fail(); | 2493 | val |= (1 << bit); |
| 2494 | this_cpu_write(current_context, val); | ||
| 2461 | 2495 | ||
| 2462 | return -1; | 2496 | return 0; |
| 2463 | } | 2497 | } |
| 2464 | 2498 | ||
| 2465 | static inline void trace_recursive_unlock(void) | 2499 | static __always_inline void trace_recursive_unlock(void) |
| 2466 | { | 2500 | { |
| 2467 | WARN_ON_ONCE(!trace_recursion_buffer()); | 2501 | unsigned int val = this_cpu_read(current_context); |
| 2468 | 2502 | ||
| 2469 | trace_recursion_dec(); | 2503 | val--; |
| 2504 | val &= this_cpu_read(current_context); | ||
| 2505 | this_cpu_write(current_context, val); | ||
| 2470 | } | 2506 | } |
| 2471 | 2507 | ||
| 2472 | #else | 2508 | #else |
| @@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) | |||
| 3067 | EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); | 3103 | EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); |
| 3068 | 3104 | ||
| 3069 | /** | 3105 | /** |
| 3106 | * ring_buffer_read_events_cpu - get the number of events successfully read | ||
| 3107 | * @buffer: The ring buffer | ||
| 3108 | * @cpu: The per CPU buffer to get the number of events read | ||
| 3109 | */ | ||
| 3110 | unsigned long | ||
| 3111 | ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) | ||
| 3112 | { | ||
| 3113 | struct ring_buffer_per_cpu *cpu_buffer; | ||
| 3114 | |||
| 3115 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
| 3116 | return 0; | ||
| 3117 | |||
| 3118 | cpu_buffer = buffer->buffers[cpu]; | ||
| 3119 | return cpu_buffer->read; | ||
| 3120 | } | ||
| 3121 | EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); | ||
| 3122 | |||
| 3123 | /** | ||
| 3070 | * ring_buffer_entries - get the number of entries in a buffer | 3124 | * ring_buffer_entries - get the number of entries in a buffer |
| 3071 | * @buffer: The ring buffer | 3125 | * @buffer: The ring buffer |
| 3072 | * | 3126 | * |
| @@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) | |||
| 3425 | /* check for end of page padding */ | 3479 | /* check for end of page padding */ |
| 3426 | if ((iter->head >= rb_page_size(iter->head_page)) && | 3480 | if ((iter->head >= rb_page_size(iter->head_page)) && |
| 3427 | (iter->head_page != cpu_buffer->commit_page)) | 3481 | (iter->head_page != cpu_buffer->commit_page)) |
| 3428 | rb_advance_iter(iter); | 3482 | rb_inc_iter(iter); |
| 3429 | } | 3483 | } |
| 3430 | 3484 | ||
| 3431 | static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) | 3485 | static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) |
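The new trace_recursive_lock()/trace_recursive_unlock() above encode one bit per context and rely on val &= (val - 1) to drop the innermost context on unlock, exactly as the long comment walks through in binary. Below is a stand-alone sketch of that guard; the per-CPU variable and the in_nmi()/in_irq()/in_interrupt() tests of the real code are replaced with a plain global and an explicit context argument.

#include <assert.h>
#include <stdio.h>

/* Lower bit number = context that can preempt the ones above it. */
enum ctx { CTX_NMI = 0, CTX_IRQ = 1, CTX_SOFTIRQ = 2, CTX_NORMAL = 3 };

static unsigned int current_context;

static int recursive_lock(enum ctx bit)
{
	if (current_context & (1u << bit))
		return 1;			/* recursion in this context */
	current_context |= 1u << bit;
	return 0;
}

static void recursive_unlock(void)
{
	/* Clear the lowest set bit: the most recently entered context. */
	current_context &= current_context - 1;
}

int main(void)
{
	assert(recursive_lock(CTX_NORMAL) == 0);  /* task context enters    */
	assert(recursive_lock(CTX_IRQ) == 0);     /* an IRQ preempts it     */
	assert(recursive_lock(CTX_IRQ) == 1);     /* same-context recursion */

	recursive_unlock();                       /* IRQ leaves (clears bit 1)  */
	assert(current_context == (1u << CTX_NORMAL));

	recursive_unlock();                       /* task leaves (clears bit 3) */
	assert(current_context == 0);

	printf("context recursion guard: ok\n");
	return 0;
}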
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5125677efa0..c2e2c2310374 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/poll.h> | 39 | #include <linux/poll.h> |
| 40 | #include <linux/nmi.h> | 40 | #include <linux/nmi.h> |
| 41 | #include <linux/fs.h> | 41 | #include <linux/fs.h> |
| 42 | #include <linux/sched/rt.h> | ||
| 42 | 43 | ||
| 43 | #include "trace.h" | 44 | #include "trace.h" |
| 44 | #include "trace_output.h" | 45 | #include "trace_output.h" |
| @@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; | |||
| 249 | static struct tracer *trace_types __read_mostly; | 250 | static struct tracer *trace_types __read_mostly; |
| 250 | 251 | ||
| 251 | /* current_trace points to the tracer that is currently active */ | 252 | /* current_trace points to the tracer that is currently active */ |
| 252 | static struct tracer *current_trace __read_mostly; | 253 | static struct tracer *current_trace __read_mostly = &nop_trace; |
| 253 | 254 | ||
| 254 | /* | 255 | /* |
| 255 | * trace_types_lock is used to protect the trace_types list. | 256 | * trace_types_lock is used to protect the trace_types list. |
| @@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 709 | return; | 710 | return; |
| 710 | 711 | ||
| 711 | WARN_ON_ONCE(!irqs_disabled()); | 712 | WARN_ON_ONCE(!irqs_disabled()); |
| 712 | if (!current_trace->use_max_tr) { | 713 | |
| 713 | WARN_ON_ONCE(1); | 714 | if (!current_trace->allocated_snapshot) { |
| 715 | /* Only the nop tracer should hit this when disabling */ | ||
| 716 | WARN_ON_ONCE(current_trace != &nop_trace); | ||
| 714 | return; | 717 | return; |
| 715 | } | 718 | } |
| 719 | |||
| 716 | arch_spin_lock(&ftrace_max_lock); | 720 | arch_spin_lock(&ftrace_max_lock); |
| 717 | 721 | ||
| 718 | tr->buffer = max_tr.buffer; | 722 | tr->buffer = max_tr.buffer; |
| @@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 739 | return; | 743 | return; |
| 740 | 744 | ||
| 741 | WARN_ON_ONCE(!irqs_disabled()); | 745 | WARN_ON_ONCE(!irqs_disabled()); |
| 742 | if (!current_trace->use_max_tr) { | 746 | if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) |
| 743 | WARN_ON_ONCE(1); | ||
| 744 | return; | 747 | return; |
| 745 | } | ||
| 746 | 748 | ||
| 747 | arch_spin_lock(&ftrace_max_lock); | 749 | arch_spin_lock(&ftrace_max_lock); |
| 748 | 750 | ||
| @@ -862,10 +864,13 @@ int register_tracer(struct tracer *type) | |||
| 862 | 864 | ||
| 863 | current_trace = type; | 865 | current_trace = type; |
| 864 | 866 | ||
| 865 | /* If we expanded the buffers, make sure the max is expanded too */ | 867 | if (type->use_max_tr) { |
| 866 | if (ring_buffer_expanded && type->use_max_tr) | 868 | /* If we expanded the buffers, make sure the max is expanded too */ |
| 867 | ring_buffer_resize(max_tr.buffer, trace_buf_size, | 869 | if (ring_buffer_expanded) |
| 868 | RING_BUFFER_ALL_CPUS); | 870 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
| 871 | RING_BUFFER_ALL_CPUS); | ||
| 872 | type->allocated_snapshot = true; | ||
| 873 | } | ||
| 869 | 874 | ||
| 870 | /* the test is responsible for initializing and enabling */ | 875 | /* the test is responsible for initializing and enabling */ |
| 871 | pr_info("Testing tracer %s: ", type->name); | 876 | pr_info("Testing tracer %s: ", type->name); |
| @@ -881,10 +886,14 @@ int register_tracer(struct tracer *type) | |||
| 881 | /* Only reset on passing, to avoid touching corrupted buffers */ | 886 | /* Only reset on passing, to avoid touching corrupted buffers */ |
| 882 | tracing_reset_online_cpus(tr); | 887 | tracing_reset_online_cpus(tr); |
| 883 | 888 | ||
| 884 | /* Shrink the max buffer again */ | 889 | if (type->use_max_tr) { |
| 885 | if (ring_buffer_expanded && type->use_max_tr) | 890 | type->allocated_snapshot = false; |
| 886 | ring_buffer_resize(max_tr.buffer, 1, | 891 | |
| 887 | RING_BUFFER_ALL_CPUS); | 892 | /* Shrink the max buffer again */ |
| 893 | if (ring_buffer_expanded) | ||
| 894 | ring_buffer_resize(max_tr.buffer, 1, | ||
| 895 | RING_BUFFER_ALL_CPUS); | ||
| 896 | } | ||
| 888 | 897 | ||
| 889 | printk(KERN_CONT "PASSED\n"); | 898 | printk(KERN_CONT "PASSED\n"); |
| 890 | } | 899 | } |
| @@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
| 922 | { | 931 | { |
| 923 | struct ring_buffer *buffer = tr->buffer; | 932 | struct ring_buffer *buffer = tr->buffer; |
| 924 | 933 | ||
| 934 | if (!buffer) | ||
| 935 | return; | ||
| 936 | |||
| 925 | ring_buffer_record_disable(buffer); | 937 | ring_buffer_record_disable(buffer); |
| 926 | 938 | ||
| 927 | /* Make sure all commits have finished */ | 939 | /* Make sure all commits have finished */ |
| @@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
| 936 | struct ring_buffer *buffer = tr->buffer; | 948 | struct ring_buffer *buffer = tr->buffer; |
| 937 | int cpu; | 949 | int cpu; |
| 938 | 950 | ||
| 951 | if (!buffer) | ||
| 952 | return; | ||
| 953 | |||
| 939 | ring_buffer_record_disable(buffer); | 954 | ring_buffer_record_disable(buffer); |
| 940 | 955 | ||
| 941 | /* Make sure all commits have finished */ | 956 | /* Make sure all commits have finished */ |
| @@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
| 1167 | 1182 | ||
| 1168 | entry->preempt_count = pc & 0xff; | 1183 | entry->preempt_count = pc & 0xff; |
| 1169 | entry->pid = (tsk) ? tsk->pid : 0; | 1184 | entry->pid = (tsk) ? tsk->pid : 0; |
| 1170 | entry->padding = 0; | ||
| 1171 | entry->flags = | 1185 | entry->flags = |
| 1172 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1186 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
| 1173 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1187 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
| @@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
| 1335 | */ | 1349 | */ |
| 1336 | preempt_disable_notrace(); | 1350 | preempt_disable_notrace(); |
| 1337 | 1351 | ||
| 1338 | use_stack = ++__get_cpu_var(ftrace_stack_reserve); | 1352 | use_stack = __this_cpu_inc_return(ftrace_stack_reserve); |
| 1339 | /* | 1353 | /* |
| 1340 | * We don't need any atomic variables, just a barrier. | 1354 | * We don't need any atomic variables, just a barrier. |
| 1341 | * If an interrupt comes in, we don't care, because it would | 1355 | * If an interrupt comes in, we don't care, because it would |
| @@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
| 1389 | out: | 1403 | out: |
| 1390 | /* Again, don't let gcc optimize things here */ | 1404 | /* Again, don't let gcc optimize things here */ |
| 1391 | barrier(); | 1405 | barrier(); |
| 1392 | __get_cpu_var(ftrace_stack_reserve)--; | 1406 | __this_cpu_dec(ftrace_stack_reserve); |
| 1393 | preempt_enable_notrace(); | 1407 | preempt_enable_notrace(); |
| 1394 | 1408 | ||
| 1395 | } | 1409 | } |
| @@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; | |||
| 1517 | static char *get_trace_buf(void) | 1531 | static char *get_trace_buf(void) |
| 1518 | { | 1532 | { |
| 1519 | struct trace_buffer_struct *percpu_buffer; | 1533 | struct trace_buffer_struct *percpu_buffer; |
| 1520 | struct trace_buffer_struct *buffer; | ||
| 1521 | 1534 | ||
| 1522 | /* | 1535 | /* |
| 1523 | * If we have allocated per cpu buffers, then we do not | 1536 | * If we have allocated per cpu buffers, then we do not |
| @@ -1535,9 +1548,7 @@ static char *get_trace_buf(void) | |||
| 1535 | if (!percpu_buffer) | 1548 | if (!percpu_buffer) |
| 1536 | return NULL; | 1549 | return NULL; |
| 1537 | 1550 | ||
| 1538 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | 1551 | return this_cpu_ptr(&percpu_buffer->buffer[0]); |
| 1539 | |||
| 1540 | return buffer->buffer; | ||
| 1541 | } | 1552 | } |
| 1542 | 1553 | ||
| 1543 | static int alloc_percpu_trace_buffer(void) | 1554 | static int alloc_percpu_trace_buffer(void) |
| @@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
| 1942 | static void *s_start(struct seq_file *m, loff_t *pos) | 1953 | static void *s_start(struct seq_file *m, loff_t *pos) |
| 1943 | { | 1954 | { |
| 1944 | struct trace_iterator *iter = m->private; | 1955 | struct trace_iterator *iter = m->private; |
| 1945 | static struct tracer *old_tracer; | ||
| 1946 | int cpu_file = iter->cpu_file; | 1956 | int cpu_file = iter->cpu_file; |
| 1947 | void *p = NULL; | 1957 | void *p = NULL; |
| 1948 | loff_t l = 0; | 1958 | loff_t l = 0; |
| 1949 | int cpu; | 1959 | int cpu; |
| 1950 | 1960 | ||
| 1951 | /* copy the tracer to avoid using a global lock all around */ | 1961 | /* |
| 1962 | * copy the tracer to avoid using a global lock all around. | ||
| 1963 | * iter->trace is a copy of current_trace, the pointer to the | ||
| 1964 | * name may be used instead of a strcmp(), as iter->trace->name | ||
| 1965 | * will point to the same string as current_trace->name. | ||
| 1966 | */ | ||
| 1952 | mutex_lock(&trace_types_lock); | 1967 | mutex_lock(&trace_types_lock); |
| 1953 | if (unlikely(old_tracer != current_trace && current_trace)) { | 1968 | if (unlikely(current_trace && iter->trace->name != current_trace->name)) |
| 1954 | old_tracer = current_trace; | ||
| 1955 | *iter->trace = *current_trace; | 1969 | *iter->trace = *current_trace; |
| 1956 | } | ||
| 1957 | mutex_unlock(&trace_types_lock); | 1970 | mutex_unlock(&trace_types_lock); |
| 1958 | 1971 | ||
| 1959 | atomic_inc(&trace_record_cmdline_disabled); | 1972 | if (iter->snapshot && iter->trace->use_max_tr) |
| 1973 | return ERR_PTR(-EBUSY); | ||
| 1974 | |||
| 1975 | if (!iter->snapshot) | ||
| 1976 | atomic_inc(&trace_record_cmdline_disabled); | ||
| 1960 | 1977 | ||
| 1961 | if (*pos != iter->pos) { | 1978 | if (*pos != iter->pos) { |
| 1962 | iter->ent = NULL; | 1979 | iter->ent = NULL; |
| @@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p) | |||
| 1995 | { | 2012 | { |
| 1996 | struct trace_iterator *iter = m->private; | 2013 | struct trace_iterator *iter = m->private; |
| 1997 | 2014 | ||
| 1998 | atomic_dec(&trace_record_cmdline_disabled); | 2015 | if (iter->snapshot && iter->trace->use_max_tr) |
| 2016 | return; | ||
| 2017 | |||
| 2018 | if (!iter->snapshot) | ||
| 2019 | atomic_dec(&trace_record_cmdline_disabled); | ||
| 1999 | trace_access_unlock(iter->cpu_file); | 2020 | trace_access_unlock(iter->cpu_file); |
| 2000 | trace_event_read_unlock(); | 2021 | trace_event_read_unlock(); |
| 2001 | } | 2022 | } |
| @@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
| 2080 | unsigned long total; | 2101 | unsigned long total; |
| 2081 | const char *name = "preemption"; | 2102 | const char *name = "preemption"; |
| 2082 | 2103 | ||
| 2083 | if (type) | 2104 | name = type->name; |
| 2084 | name = type->name; | ||
| 2085 | 2105 | ||
| 2086 | get_total_entries(tr, &total, &entries); | 2106 | get_total_entries(tr, &total, &entries); |
| 2087 | 2107 | ||
| @@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = { | |||
| 2430 | }; | 2450 | }; |
| 2431 | 2451 | ||
| 2432 | static struct trace_iterator * | 2452 | static struct trace_iterator * |
| 2433 | __tracing_open(struct inode *inode, struct file *file) | 2453 | __tracing_open(struct inode *inode, struct file *file, bool snapshot) |
| 2434 | { | 2454 | { |
| 2435 | long cpu_file = (long) inode->i_private; | 2455 | long cpu_file = (long) inode->i_private; |
| 2436 | struct trace_iterator *iter; | 2456 | struct trace_iterator *iter; |
| @@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file) | |||
| 2457 | if (!iter->trace) | 2477 | if (!iter->trace) |
| 2458 | goto fail; | 2478 | goto fail; |
| 2459 | 2479 | ||
| 2460 | if (current_trace) | 2480 | *iter->trace = *current_trace; |
| 2461 | *iter->trace = *current_trace; | ||
| 2462 | 2481 | ||
| 2463 | if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) | 2482 | if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) |
| 2464 | goto fail; | 2483 | goto fail; |
| 2465 | 2484 | ||
| 2466 | if (current_trace && current_trace->print_max) | 2485 | if (current_trace->print_max || snapshot) |
| 2467 | iter->tr = &max_tr; | 2486 | iter->tr = &max_tr; |
| 2468 | else | 2487 | else |
| 2469 | iter->tr = &global_trace; | 2488 | iter->tr = &global_trace; |
| 2489 | iter->snapshot = snapshot; | ||
| 2470 | iter->pos = -1; | 2490 | iter->pos = -1; |
| 2471 | mutex_init(&iter->mutex); | 2491 | mutex_init(&iter->mutex); |
| 2472 | iter->cpu_file = cpu_file; | 2492 | iter->cpu_file = cpu_file; |
| @@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file) | |||
| 2483 | if (trace_clocks[trace_clock_id].in_ns) | 2503 | if (trace_clocks[trace_clock_id].in_ns) |
| 2484 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | 2504 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; |
| 2485 | 2505 | ||
| 2486 | /* stop the trace while dumping */ | 2506 | /* stop the trace while dumping if we are not opening "snapshot" */ |
| 2487 | tracing_stop(); | 2507 | if (!iter->snapshot) |
| 2508 | tracing_stop(); | ||
| 2488 | 2509 | ||
| 2489 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { | 2510 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { |
| 2490 | for_each_tracing_cpu(cpu) { | 2511 | for_each_tracing_cpu(cpu) { |
| @@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
| 2547 | if (iter->trace && iter->trace->close) | 2568 | if (iter->trace && iter->trace->close) |
| 2548 | iter->trace->close(iter); | 2569 | iter->trace->close(iter); |
| 2549 | 2570 | ||
| 2550 | /* reenable tracing if it was previously enabled */ | 2571 | if (!iter->snapshot) |
| 2551 | tracing_start(); | 2572 | /* reenable tracing if it was previously enabled */ |
| 2573 | tracing_start(); | ||
| 2552 | mutex_unlock(&trace_types_lock); | 2574 | mutex_unlock(&trace_types_lock); |
| 2553 | 2575 | ||
| 2554 | mutex_destroy(&iter->mutex); | 2576 | mutex_destroy(&iter->mutex); |
| @@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file) | |||
| 2576 | } | 2598 | } |
| 2577 | 2599 | ||
| 2578 | if (file->f_mode & FMODE_READ) { | 2600 | if (file->f_mode & FMODE_READ) { |
| 2579 | iter = __tracing_open(inode, file); | 2601 | iter = __tracing_open(inode, file, false); |
| 2580 | if (IS_ERR(iter)) | 2602 | if (IS_ERR(iter)) |
| 2581 | ret = PTR_ERR(iter); | 2603 | ret = PTR_ERR(iter); |
| 2582 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) | 2604 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) |
| @@ -2899,6 +2921,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
| 2899 | if (copy_from_user(&buf, ubuf, cnt)) | 2921 | if (copy_from_user(&buf, ubuf, cnt)) |
| 2900 | return -EFAULT; | 2922 | return -EFAULT; |
| 2901 | 2923 | ||
| 2924 | buf[cnt] = 0; | ||
| 2925 | |||
| 2902 | trace_set_options(buf); | 2926 | trace_set_options(buf); |
| 2903 | 2927 | ||
| 2904 | *ppos += cnt; | 2928 | *ppos += cnt; |
| @@ -3012,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, | |||
| 3012 | int r; | 3036 | int r; |
| 3013 | 3037 | ||
| 3014 | mutex_lock(&trace_types_lock); | 3038 | mutex_lock(&trace_types_lock); |
| 3015 | if (current_trace) | 3039 | r = sprintf(buf, "%s\n", current_trace->name); |
| 3016 | r = sprintf(buf, "%s\n", current_trace->name); | ||
| 3017 | else | ||
| 3018 | r = sprintf(buf, "\n"); | ||
| 3019 | mutex_unlock(&trace_types_lock); | 3040 | mutex_unlock(&trace_types_lock); |
| 3020 | 3041 | ||
| 3021 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3042 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
| @@ -3181,6 +3202,7 @@ static int tracing_set_tracer(const char *buf) | |||
| 3181 | static struct trace_option_dentry *topts; | 3202 | static struct trace_option_dentry *topts; |
| 3182 | struct trace_array *tr = &global_trace; | 3203 | struct trace_array *tr = &global_trace; |
| 3183 | struct tracer *t; | 3204 | struct tracer *t; |
| 3205 | bool had_max_tr; | ||
| 3184 | int ret = 0; | 3206 | int ret = 0; |
| 3185 | 3207 | ||
| 3186 | mutex_lock(&trace_types_lock); | 3208 | mutex_lock(&trace_types_lock); |
| @@ -3205,9 +3227,21 @@ static int tracing_set_tracer(const char *buf) | |||
| 3205 | goto out; | 3227 | goto out; |
| 3206 | 3228 | ||
| 3207 | trace_branch_disable(); | 3229 | trace_branch_disable(); |
| 3208 | if (current_trace && current_trace->reset) | 3230 | if (current_trace->reset) |
| 3209 | current_trace->reset(tr); | 3231 | current_trace->reset(tr); |
| 3210 | if (current_trace && current_trace->use_max_tr) { | 3232 | |
| 3233 | had_max_tr = current_trace->allocated_snapshot; | ||
| 3234 | current_trace = &nop_trace; | ||
| 3235 | |||
| 3236 | if (had_max_tr && !t->use_max_tr) { | ||
| 3237 | /* | ||
| 3238 | * We need to make sure that the update_max_tr sees that | ||
| 3239 | * current_trace changed to nop_trace to keep it from | ||
| 3240 | * swapping the buffers after we resize it. | ||
| 3241 | * The update_max_tr is called from interrupts disabled | ||
| 3242 | * so a synchronized_sched() is sufficient. | ||
| 3243 | */ | ||
| 3244 | synchronize_sched(); | ||
| 3211 | /* | 3245 | /* |
| 3212 | * We don't free the ring buffer. instead, resize it because | 3246 | * We don't free the ring buffer. instead, resize it because |
| 3213 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3247 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
| @@ -3215,18 +3249,19 @@ static int tracing_set_tracer(const char *buf) | |||
| 3215 | */ | 3249 | */ |
| 3216 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); | 3250 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
| 3217 | set_buffer_entries(&max_tr, 1); | 3251 | set_buffer_entries(&max_tr, 1); |
| 3252 | tracing_reset_online_cpus(&max_tr); | ||
| 3253 | current_trace->allocated_snapshot = false; | ||
| 3218 | } | 3254 | } |
| 3219 | destroy_trace_option_files(topts); | 3255 | destroy_trace_option_files(topts); |
| 3220 | 3256 | ||
| 3221 | current_trace = &nop_trace; | ||
| 3222 | |||
| 3223 | topts = create_trace_option_files(t); | 3257 | topts = create_trace_option_files(t); |
| 3224 | if (t->use_max_tr) { | 3258 | if (t->use_max_tr && !had_max_tr) { |
| 3225 | /* we need to make per cpu buffer sizes equivalent */ | 3259 | /* we need to make per cpu buffer sizes equivalent */ |
| 3226 | ret = resize_buffer_duplicate_size(&max_tr, &global_trace, | 3260 | ret = resize_buffer_duplicate_size(&max_tr, &global_trace, |
| 3227 | RING_BUFFER_ALL_CPUS); | 3261 | RING_BUFFER_ALL_CPUS); |
| 3228 | if (ret < 0) | 3262 | if (ret < 0) |
| 3229 | goto out; | 3263 | goto out; |
| 3264 | t->allocated_snapshot = true; | ||
| 3230 | } | 3265 | } |
| 3231 | 3266 | ||
| 3232 | if (t->init) { | 3267 | if (t->init) { |
| @@ -3334,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 3334 | ret = -ENOMEM; | 3369 | ret = -ENOMEM; |
| 3335 | goto fail; | 3370 | goto fail; |
| 3336 | } | 3371 | } |
| 3337 | if (current_trace) | 3372 | *iter->trace = *current_trace; |
| 3338 | *iter->trace = *current_trace; | ||
| 3339 | 3373 | ||
| 3340 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { | 3374 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { |
| 3341 | ret = -ENOMEM; | 3375 | ret = -ENOMEM; |
| @@ -3452,7 +3486,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 3452 | return -EINTR; | 3486 | return -EINTR; |
| 3453 | 3487 | ||
| 3454 | /* | 3488 | /* |
| 3455 | * We block until we read something and tracing is enabled. | 3489 | * We block until we read something and tracing is disabled. |
| 3456 | * We still block if tracing is disabled, but we have never | 3490 | * We still block if tracing is disabled, but we have never |
| 3457 | * read anything. This allows a user to cat this file, and | 3491 | * read anything. This allows a user to cat this file, and |
| 3458 | * then enable tracing. But after we have read something, | 3492 | * then enable tracing. But after we have read something, |
| @@ -3460,7 +3494,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 3460 | * | 3494 | * |
| 3461 | * iter->pos will be 0 if we haven't read anything. | 3495 | * iter->pos will be 0 if we haven't read anything. |
| 3462 | */ | 3496 | */ |
| 3463 | if (tracing_is_enabled() && iter->pos) | 3497 | if (!tracing_is_enabled() && iter->pos) |
| 3464 | break; | 3498 | break; |
| 3465 | } | 3499 | } |
| 3466 | 3500 | ||
| @@ -3475,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 3475 | size_t cnt, loff_t *ppos) | 3509 | size_t cnt, loff_t *ppos) |
| 3476 | { | 3510 | { |
| 3477 | struct trace_iterator *iter = filp->private_data; | 3511 | struct trace_iterator *iter = filp->private_data; |
| 3478 | static struct tracer *old_tracer; | ||
| 3479 | ssize_t sret; | 3512 | ssize_t sret; |
| 3480 | 3513 | ||
| 3481 | /* return any leftover data */ | 3514 | /* return any leftover data */ |
| @@ -3487,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 3487 | 3520 | ||
| 3488 | /* copy the tracer to avoid using a global lock all around */ | 3521 | /* copy the tracer to avoid using a global lock all around */ |
| 3489 | mutex_lock(&trace_types_lock); | 3522 | mutex_lock(&trace_types_lock); |
| 3490 | if (unlikely(old_tracer != current_trace && current_trace)) { | 3523 | if (unlikely(iter->trace->name != current_trace->name)) |
| 3491 | old_tracer = current_trace; | ||
| 3492 | *iter->trace = *current_trace; | 3524 | *iter->trace = *current_trace; |
| 3493 | } | ||
| 3494 | mutex_unlock(&trace_types_lock); | 3525 | mutex_unlock(&trace_types_lock); |
| 3495 | 3526 | ||
| 3496 | /* | 3527 | /* |
| @@ -3646,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 3646 | .ops = &tracing_pipe_buf_ops, | 3677 | .ops = &tracing_pipe_buf_ops, |
| 3647 | .spd_release = tracing_spd_release_pipe, | 3678 | .spd_release = tracing_spd_release_pipe, |
| 3648 | }; | 3679 | }; |
| 3649 | static struct tracer *old_tracer; | ||
| 3650 | ssize_t ret; | 3680 | ssize_t ret; |
| 3651 | size_t rem; | 3681 | size_t rem; |
| 3652 | unsigned int i; | 3682 | unsigned int i; |
| @@ -3656,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 3656 | 3686 | ||
| 3657 | /* copy the tracer to avoid using a global lock all around */ | 3687 | /* copy the tracer to avoid using a global lock all around */ |
| 3658 | mutex_lock(&trace_types_lock); | 3688 | mutex_lock(&trace_types_lock); |
| 3659 | if (unlikely(old_tracer != current_trace && current_trace)) { | 3689 | if (unlikely(iter->trace->name != current_trace->name)) |
| 3660 | old_tracer = current_trace; | ||
| 3661 | *iter->trace = *current_trace; | 3690 | *iter->trace = *current_trace; |
| 3662 | } | ||
| 3663 | mutex_unlock(&trace_types_lock); | 3691 | mutex_unlock(&trace_types_lock); |
| 3664 | 3692 | ||
| 3665 | mutex_lock(&iter->mutex); | 3693 | mutex_lock(&iter->mutex); |
| @@ -4035,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
| 4035 | * Reset the buffer so that it doesn't have incomparable timestamps. | 4063 | * Reset the buffer so that it doesn't have incomparable timestamps. |
| 4036 | */ | 4064 | */ |
| 4037 | tracing_reset_online_cpus(&global_trace); | 4065 | tracing_reset_online_cpus(&global_trace); |
| 4038 | if (max_tr.buffer) | 4066 | tracing_reset_online_cpus(&max_tr); |
| 4039 | tracing_reset_online_cpus(&max_tr); | ||
| 4040 | 4067 | ||
| 4041 | mutex_unlock(&trace_types_lock); | 4068 | mutex_unlock(&trace_types_lock); |
| 4042 | 4069 | ||
| @@ -4052,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file) | |||
| 4052 | return single_open(file, tracing_clock_show, NULL); | 4079 | return single_open(file, tracing_clock_show, NULL); |
| 4053 | } | 4080 | } |
| 4054 | 4081 | ||
| 4082 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
| 4083 | static int tracing_snapshot_open(struct inode *inode, struct file *file) | ||
| 4084 | { | ||
| 4085 | struct trace_iterator *iter; | ||
| 4086 | int ret = 0; | ||
| 4087 | |||
| 4088 | if (file->f_mode & FMODE_READ) { | ||
| 4089 | iter = __tracing_open(inode, file, true); | ||
| 4090 | if (IS_ERR(iter)) | ||
| 4091 | ret = PTR_ERR(iter); | ||
| 4092 | } | ||
| 4093 | return ret; | ||
| 4094 | } | ||
| 4095 | |||
| 4096 | static ssize_t | ||
| 4097 | tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, | ||
| 4098 | loff_t *ppos) | ||
| 4099 | { | ||
| 4100 | unsigned long val; | ||
| 4101 | int ret; | ||
| 4102 | |||
| 4103 | ret = tracing_update_buffers(); | ||
| 4104 | if (ret < 0) | ||
| 4105 | return ret; | ||
| 4106 | |||
| 4107 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
| 4108 | if (ret) | ||
| 4109 | return ret; | ||
| 4110 | |||
| 4111 | mutex_lock(&trace_types_lock); | ||
| 4112 | |||
| 4113 | if (current_trace->use_max_tr) { | ||
| 4114 | ret = -EBUSY; | ||
| 4115 | goto out; | ||
| 4116 | } | ||
| 4117 | |||
| 4118 | switch (val) { | ||
| 4119 | case 0: | ||
| 4120 | if (current_trace->allocated_snapshot) { | ||
| 4121 | /* free spare buffer */ | ||
| 4122 | ring_buffer_resize(max_tr.buffer, 1, | ||
| 4123 | RING_BUFFER_ALL_CPUS); | ||
| 4124 | set_buffer_entries(&max_tr, 1); | ||
| 4125 | tracing_reset_online_cpus(&max_tr); | ||
| 4126 | current_trace->allocated_snapshot = false; | ||
| 4127 | } | ||
| 4128 | break; | ||
| 4129 | case 1: | ||
| 4130 | if (!current_trace->allocated_snapshot) { | ||
| 4131 | /* allocate spare buffer */ | ||
| 4132 | ret = resize_buffer_duplicate_size(&max_tr, | ||
| 4133 | &global_trace, RING_BUFFER_ALL_CPUS); | ||
| 4134 | if (ret < 0) | ||
| 4135 | break; | ||
| 4136 | current_trace->allocated_snapshot = true; | ||
| 4137 | } | ||
| 4138 | |||
| 4139 | local_irq_disable(); | ||
| 4140 | /* Now, we're going to swap */ | ||
| 4141 | update_max_tr(&global_trace, current, smp_processor_id()); | ||
| 4142 | local_irq_enable(); | ||
| 4143 | break; | ||
| 4144 | default: | ||
| 4145 | if (current_trace->allocated_snapshot) | ||
| 4146 | tracing_reset_online_cpus(&max_tr); | ||
| 4147 | else | ||
| 4148 | ret = -EINVAL; | ||
| 4149 | break; | ||
| 4150 | } | ||
| 4151 | |||
| 4152 | if (ret >= 0) { | ||
| 4153 | *ppos += cnt; | ||
| 4154 | ret = cnt; | ||
| 4155 | } | ||
| 4156 | out: | ||
| 4157 | mutex_unlock(&trace_types_lock); | ||
| 4158 | return ret; | ||
| 4159 | } | ||
| 4160 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
| 4161 | |||
| 4162 | |||
| 4055 | static const struct file_operations tracing_max_lat_fops = { | 4163 | static const struct file_operations tracing_max_lat_fops = { |
| 4056 | .open = tracing_open_generic, | 4164 | .open = tracing_open_generic, |
| 4057 | .read = tracing_max_lat_read, | 4165 | .read = tracing_max_lat_read, |
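The tracing_snapshot_write() handler above defines the semantics of the new debugfs "snapshot" file (created further down in this diff with trace_create_file("snapshot", ...)): writing 0 frees the spare max_tr buffer, writing 1 allocates it if needed and swaps the live trace into it, and any other value clears an already-allocated snapshot. A minimal userspace sketch, not part of the patch, assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption; adjust to wherever debugfs is mounted. */
	int fd = open("/sys/kernel/debug/tracing/snapshot", O_WRONLY);

	if (fd < 0) {
		perror("open snapshot");
		return 1;
	}
	/* "1" allocates the spare buffer (if needed) and snapshots the current trace. */
	if (write(fd, "1", 1) != 1)
		perror("write snapshot");
	close(fd);
	return 0;
}

Reading the same file (opened through __tracing_open() with the snapshot flag) dumps the saved buffer, so a later read shows the trace as it looked when the swap happened.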
| @@ -4108,6 +4216,16 @@ static const struct file_operations trace_clock_fops = { | |||
| 4108 | .write = tracing_clock_write, | 4216 | .write = tracing_clock_write, |
| 4109 | }; | 4217 | }; |
| 4110 | 4218 | ||
| 4219 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
| 4220 | static const struct file_operations snapshot_fops = { | ||
| 4221 | .open = tracing_snapshot_open, | ||
| 4222 | .read = seq_read, | ||
| 4223 | .write = tracing_snapshot_write, | ||
| 4224 | .llseek = tracing_seek, | ||
| 4225 | .release = tracing_release, | ||
| 4226 | }; | ||
| 4227 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
| 4228 | |||
| 4111 | struct ftrace_buffer_info { | 4229 | struct ftrace_buffer_info { |
| 4112 | struct trace_array *tr; | 4230 | struct trace_array *tr; |
| 4113 | void *spare; | 4231 | void *spare; |
| @@ -4412,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
| 4412 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); | 4530 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); |
| 4413 | trace_seq_printf(s, "dropped events: %ld\n", cnt); | 4531 | trace_seq_printf(s, "dropped events: %ld\n", cnt); |
| 4414 | 4532 | ||
| 4533 | cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); | ||
| 4534 | trace_seq_printf(s, "read events: %ld\n", cnt); | ||
| 4535 | |||
| 4415 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4536 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
| 4416 | 4537 | ||
| 4417 | kfree(s); | 4538 | kfree(s); |
| @@ -4488,7 +4609,7 @@ struct dentry *tracing_init_dentry(void) | |||
| 4488 | 4609 | ||
| 4489 | static struct dentry *d_percpu; | 4610 | static struct dentry *d_percpu; |
| 4490 | 4611 | ||
| 4491 | struct dentry *tracing_dentry_percpu(void) | 4612 | static struct dentry *tracing_dentry_percpu(void) |
| 4492 | { | 4613 | { |
| 4493 | static int once; | 4614 | static int once; |
| 4494 | struct dentry *d_tracer; | 4615 | struct dentry *d_tracer; |
| @@ -4815,10 +4936,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
| 4815 | return ret; | 4936 | return ret; |
| 4816 | 4937 | ||
| 4817 | if (buffer) { | 4938 | if (buffer) { |
| 4818 | if (val) | 4939 | mutex_lock(&trace_types_lock); |
| 4940 | if (val) { | ||
| 4819 | ring_buffer_record_on(buffer); | 4941 | ring_buffer_record_on(buffer); |
| 4820 | else | 4942 | if (current_trace->start) |
| 4943 | current_trace->start(tr); | ||
| 4944 | } else { | ||
| 4821 | ring_buffer_record_off(buffer); | 4945 | ring_buffer_record_off(buffer); |
| 4946 | if (current_trace->stop) | ||
| 4947 | current_trace->stop(tr); | ||
| 4948 | } | ||
| 4949 | mutex_unlock(&trace_types_lock); | ||
| 4822 | } | 4950 | } |
| 4823 | 4951 | ||
| 4824 | (*ppos)++; | 4952 | (*ppos)++; |
| @@ -4897,6 +5025,11 @@ static __init int tracer_init_debugfs(void) | |||
| 4897 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 5025 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
| 4898 | #endif | 5026 | #endif |
| 4899 | 5027 | ||
| 5028 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
| 5029 | trace_create_file("snapshot", 0644, d_tracer, | ||
| 5030 | (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); | ||
| 5031 | #endif | ||
| 5032 | |||
| 4900 | create_trace_options_dir(); | 5033 | create_trace_options_dir(); |
| 4901 | 5034 | ||
| 4902 | for_each_tracing_cpu(cpu) | 5035 | for_each_tracing_cpu(cpu) |
| @@ -5005,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
| 5005 | if (disable_tracing) | 5138 | if (disable_tracing) |
| 5006 | ftrace_kill(); | 5139 | ftrace_kill(); |
| 5007 | 5140 | ||
| 5141 | /* Simulate the iterator */ | ||
| 5008 | trace_init_global_iter(&iter); | 5142 | trace_init_global_iter(&iter); |
| 5009 | 5143 | ||
| 5010 | for_each_tracing_cpu(cpu) { | 5144 | for_each_tracing_cpu(cpu) { |
| @@ -5016,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
| 5016 | /* don't look at user memory in panic mode */ | 5150 | /* don't look at user memory in panic mode */ |
| 5017 | trace_flags &= ~TRACE_ITER_SYM_USEROBJ; | 5151 | trace_flags &= ~TRACE_ITER_SYM_USEROBJ; |
| 5018 | 5152 | ||
| 5019 | /* Simulate the iterator */ | ||
| 5020 | iter.tr = &global_trace; | ||
| 5021 | iter.trace = current_trace; | ||
| 5022 | |||
| 5023 | switch (oops_dump_mode) { | 5153 | switch (oops_dump_mode) { |
| 5024 | case DUMP_ALL: | 5154 | case DUMP_ALL: |
| 5025 | iter.cpu_file = TRACE_PIPE_ALL_CPU; | 5155 | iter.cpu_file = TRACE_PIPE_ALL_CPU; |
| @@ -5164,7 +5294,7 @@ __init static int tracer_alloc_buffers(void) | |||
| 5164 | init_irq_work(&trace_work_wakeup, trace_wake_up); | 5294 | init_irq_work(&trace_work_wakeup, trace_wake_up); |
| 5165 | 5295 | ||
| 5166 | register_tracer(&nop_trace); | 5296 | register_tracer(&nop_trace); |
| 5167 | current_trace = &nop_trace; | 5297 | |
| 5168 | /* All seems OK, enable tracing */ | 5298 | /* All seems OK, enable tracing */ |
| 5169 | tracing_disabled = 0; | 5299 | tracing_disabled = 0; |
| 5170 | 5300 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902c..57d7e5397d56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -287,20 +287,62 @@ struct tracer { | |||
| 287 | struct tracer_flags *flags; | 287 | struct tracer_flags *flags; |
| 288 | bool print_max; | 288 | bool print_max; |
| 289 | bool use_max_tr; | 289 | bool use_max_tr; |
| 290 | bool allocated_snapshot; | ||
| 290 | }; | 291 | }; |
| 291 | 292 | ||
| 292 | 293 | ||
| 293 | /* Only current can touch trace_recursion */ | 294 | /* Only current can touch trace_recursion */ |
| 294 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
| 295 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
| 296 | 295 | ||
| 297 | /* Ring buffer has the 10 LSB bits to count */ | 296 | /* |
| 298 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | 297 | * For function tracing recursion: |
| 299 | 298 | * The order of these bits is important. |
| 300 | /* for function tracing recursion */ | 299 | * |
| 301 | #define TRACE_INTERNAL_BIT (1<<11) | 300 | * When function tracing occurs, the following steps are made: |
| 302 | #define TRACE_GLOBAL_BIT (1<<12) | 301 | * If arch does not support a ftrace feature: |
| 303 | #define TRACE_CONTROL_BIT (1<<13) | 302 | * call internal function (uses INTERNAL bits) which calls... |
| 303 | * If callback is registered to the "global" list, the list | ||
| 304 | * function is called and recursion checks the GLOBAL bits. | ||
| 305 | * then this function calls... | ||
| 306 | * The function callback, which can use the FTRACE bits to | ||
| 307 | * check for recursion. | ||
| 308 | * | ||
| 309 | * Now if the arch does not support a feature, and it calls | ||
| 310 | * the global list function which calls the ftrace callback | ||
| 311 | * all three of these steps will do a recursion protection. | ||
| 312 | * There's no reason to do one if the previous caller already | ||
| 313 | * did. The recursion that we are protecting against will | ||
| 314 | * go through the same steps again. | ||
| 315 | * | ||
| 316 | * To prevent the multiple recursion checks, if a recursion | ||
| 317 | * bit is set that is higher than the MAX bit of the current | ||
| 318 | * check, then we know that the check was made by the previous | ||
| 319 | * caller, and we can skip the current check. | ||
| 320 | */ | ||
| 321 | enum { | ||
| 322 | TRACE_BUFFER_BIT, | ||
| 323 | TRACE_BUFFER_NMI_BIT, | ||
| 324 | TRACE_BUFFER_IRQ_BIT, | ||
| 325 | TRACE_BUFFER_SIRQ_BIT, | ||
| 326 | |||
| 327 | /* Start of function recursion bits */ | ||
| 328 | TRACE_FTRACE_BIT, | ||
| 329 | TRACE_FTRACE_NMI_BIT, | ||
| 330 | TRACE_FTRACE_IRQ_BIT, | ||
| 331 | TRACE_FTRACE_SIRQ_BIT, | ||
| 332 | |||
| 333 | /* GLOBAL_BITs must be greater than FTRACE_BITs */ | ||
| 334 | TRACE_GLOBAL_BIT, | ||
| 335 | TRACE_GLOBAL_NMI_BIT, | ||
| 336 | TRACE_GLOBAL_IRQ_BIT, | ||
| 337 | TRACE_GLOBAL_SIRQ_BIT, | ||
| 338 | |||
| 339 | /* INTERNAL_BITs must be greater than GLOBAL_BITs */ | ||
| 340 | TRACE_INTERNAL_BIT, | ||
| 341 | TRACE_INTERNAL_NMI_BIT, | ||
| 342 | TRACE_INTERNAL_IRQ_BIT, | ||
| 343 | TRACE_INTERNAL_SIRQ_BIT, | ||
| 344 | |||
| 345 | TRACE_CONTROL_BIT, | ||
| 304 | 346 | ||
| 305 | /* | 347 | /* |
| 306 | * Abuse of the trace_recursion. | 348 | * Abuse of the trace_recursion. |
| @@ -309,11 +351,77 @@ struct tracer { | |||
| 309 | * was called in irq context but we have irq tracing off. Since this | 351 | * was called in irq context but we have irq tracing off. Since this |
| 310 | * can only be modified by current, we can reuse trace_recursion. | 352 | * can only be modified by current, we can reuse trace_recursion. |
| 311 | */ | 353 | */ |
| 312 | #define TRACE_IRQ_BIT (1<<13) | 354 | TRACE_IRQ_BIT, |
| 355 | }; | ||
| 356 | |||
| 357 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) | ||
| 358 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) | ||
| 359 | #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) | ||
| 360 | |||
| 361 | #define TRACE_CONTEXT_BITS 4 | ||
| 362 | |||
| 363 | #define TRACE_FTRACE_START TRACE_FTRACE_BIT | ||
| 364 | #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) | ||
| 365 | |||
| 366 | #define TRACE_GLOBAL_START TRACE_GLOBAL_BIT | ||
| 367 | #define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) | ||
| 368 | |||
| 369 | #define TRACE_LIST_START TRACE_INTERNAL_BIT | ||
| 370 | #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) | ||
| 371 | |||
| 372 | #define TRACE_CONTEXT_MASK TRACE_LIST_MAX | ||
| 373 | |||
| 374 | static __always_inline int trace_get_context_bit(void) | ||
| 375 | { | ||
| 376 | int bit; | ||
| 313 | 377 | ||
| 314 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | 378 | if (in_interrupt()) { |
| 315 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | 379 | if (in_nmi()) |
| 316 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | 380 | bit = 0; |
| 381 | |||
| 382 | else if (in_irq()) | ||
| 383 | bit = 1; | ||
| 384 | else | ||
| 385 | bit = 2; | ||
| 386 | } else | ||
| 387 | bit = 3; | ||
| 388 | |||
| 389 | return bit; | ||
| 390 | } | ||
| 391 | |||
| 392 | static __always_inline int trace_test_and_set_recursion(int start, int max) | ||
| 393 | { | ||
| 394 | unsigned int val = current->trace_recursion; | ||
| 395 | int bit; | ||
| 396 | |||
| 397 | /* A previous recursion check was made */ | ||
| 398 | if ((val & TRACE_CONTEXT_MASK) > max) | ||
| 399 | return 0; | ||
| 400 | |||
| 401 | bit = trace_get_context_bit() + start; | ||
| 402 | if (unlikely(val & (1 << bit))) | ||
| 403 | return -1; | ||
| 404 | |||
| 405 | val |= 1 << bit; | ||
| 406 | current->trace_recursion = val; | ||
| 407 | barrier(); | ||
| 408 | |||
| 409 | return bit; | ||
| 410 | } | ||
| 411 | |||
| 412 | static __always_inline void trace_clear_recursion(int bit) | ||
| 413 | { | ||
| 414 | unsigned int val = current->trace_recursion; | ||
| 415 | |||
| 416 | if (!bit) | ||
| 417 | return; | ||
| 418 | |||
| 419 | bit = 1 << bit; | ||
| 420 | val &= ~bit; | ||
| 421 | |||
| 422 | barrier(); | ||
| 423 | current->trace_recursion = val; | ||
| 424 | } | ||
| 317 | 425 | ||
| 318 | #define TRACE_PIPE_ALL_CPU -1 | 426 | #define TRACE_PIPE_ALL_CPU -1 |
| 319 | 427 | ||
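The per-context recursion bits and the trace_test_and_set_recursion()/trace_clear_recursion() helpers added above replace the old single TRACE_INTERNAL/GLOBAL bit scheme. A condensed sketch of the calling pattern, mirroring the function_trace_call() rewrite in kernel/trace/trace_functions.c later in this diff (the callback name is a placeholder):

/* Sketch only: how a function-trace callback uses the new helpers. */
static void example_func_callback(unsigned long ip, unsigned long parent_ip,
				  struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	int bit;

	preempt_disable_notrace();

	/* Claim the FTRACE bit for the current context (NMI/irq/softirq/normal). */
	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
	if (bit < 0)
		goto out;	/* recursion detected in this context */

	/* ... record the event ... */

	trace_clear_recursion(bit);
out:
	preempt_enable_notrace();
}

If an outer caller (the global list function or the internal wrapper) already performed its check, the helper returns 0 because GLOBAL and INTERNAL bits sit above the FTRACE bits, and trace_clear_recursion() treats 0 as "nothing to clear".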
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb..aa8f5f48dae6 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
| @@ -21,8 +21,6 @@ | |||
| 21 | #include <linux/ktime.h> | 21 | #include <linux/ktime.h> |
| 22 | #include <linux/trace_clock.h> | 22 | #include <linux/trace_clock.h> |
| 23 | 23 | ||
| 24 | #include "trace.h" | ||
| 25 | |||
| 26 | /* | 24 | /* |
| 27 | * trace_clock_local(): the simplest and least coherent tracing clock. | 25 | * trace_clock_local(): the simplest and least coherent tracing clock. |
| 28 | * | 26 | * |
| @@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void) | |||
| 44 | 42 | ||
| 45 | return clock; | 43 | return clock; |
| 46 | } | 44 | } |
| 45 | EXPORT_SYMBOL_GPL(trace_clock_local); | ||
| 47 | 46 | ||
| 48 | /* | 47 | /* |
| 49 | * trace_clock(): 'between' trace clock. Not completely serialized, | 48 | * trace_clock(): 'between' trace clock. Not completely serialized, |
| @@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void) | |||
| 86 | local_irq_save(flags); | 85 | local_irq_save(flags); |
| 87 | 86 | ||
| 88 | this_cpu = raw_smp_processor_id(); | 87 | this_cpu = raw_smp_processor_id(); |
| 89 | now = cpu_clock(this_cpu); | 88 | now = sched_clock_cpu(this_cpu); |
| 90 | /* | 89 | /* |
| 91 | * If in an NMI context then dont risk lockups and return the | 90 | * If in an NMI context then dont risk lockups and return the |
| 92 | * cpu_clock() time: | 91 | * cpu_clock() time: |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b946..57e9b284250c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) | |||
| 116 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
| 117 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
| 118 | __common_field(int, pid); | 118 | __common_field(int, pid); |
| 119 | __common_field(int, padding); | ||
| 120 | 119 | ||
| 121 | return ret; | 120 | return ret; |
| 122 | } | 121 | } |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab7..601152523326 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) | |||
| 47 | tracing_reset_online_cpus(tr); | 47 | tracing_reset_online_cpus(tr); |
| 48 | } | 48 | } |
| 49 | 49 | ||
| 50 | static void | ||
| 51 | function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, | ||
| 52 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 53 | { | ||
| 54 | struct trace_array *tr = func_trace; | ||
| 55 | struct trace_array_cpu *data; | ||
| 56 | unsigned long flags; | ||
| 57 | long disabled; | ||
| 58 | int cpu; | ||
| 59 | int pc; | ||
| 60 | |||
| 61 | if (unlikely(!ftrace_function_enabled)) | ||
| 62 | return; | ||
| 63 | |||
| 64 | pc = preempt_count(); | ||
| 65 | preempt_disable_notrace(); | ||
| 66 | local_save_flags(flags); | ||
| 67 | cpu = raw_smp_processor_id(); | ||
| 68 | data = tr->data[cpu]; | ||
| 69 | disabled = atomic_inc_return(&data->disabled); | ||
| 70 | |||
| 71 | if (likely(disabled == 1)) | ||
| 72 | trace_function(tr, ip, parent_ip, flags, pc); | ||
| 73 | |||
| 74 | atomic_dec(&data->disabled); | ||
| 75 | preempt_enable_notrace(); | ||
| 76 | } | ||
| 77 | |||
| 78 | /* Our option */ | 50 | /* Our option */ |
| 79 | enum { | 51 | enum { |
| 80 | TRACE_FUNC_OPT_STACK = 0x1, | 52 | TRACE_FUNC_OPT_STACK = 0x1, |
| @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; | |||
| 85 | static void | 57 | static void |
| 86 | function_trace_call(unsigned long ip, unsigned long parent_ip, | 58 | function_trace_call(unsigned long ip, unsigned long parent_ip, |
| 87 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 59 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
| 88 | |||
| 89 | { | 60 | { |
| 90 | struct trace_array *tr = func_trace; | 61 | struct trace_array *tr = func_trace; |
| 91 | struct trace_array_cpu *data; | 62 | struct trace_array_cpu *data; |
| 92 | unsigned long flags; | 63 | unsigned long flags; |
| 93 | long disabled; | 64 | int bit; |
| 94 | int cpu; | 65 | int cpu; |
| 95 | int pc; | 66 | int pc; |
| 96 | 67 | ||
| 97 | if (unlikely(!ftrace_function_enabled)) | 68 | if (unlikely(!ftrace_function_enabled)) |
| 98 | return; | 69 | return; |
| 99 | 70 | ||
| 100 | /* | 71 | pc = preempt_count(); |
| 101 | * Need to use raw, since this must be called before the | 72 | preempt_disable_notrace(); |
| 102 | * recursive protection is performed. | ||
| 103 | */ | ||
| 104 | local_irq_save(flags); | ||
| 105 | cpu = raw_smp_processor_id(); | ||
| 106 | data = tr->data[cpu]; | ||
| 107 | disabled = atomic_inc_return(&data->disabled); | ||
| 108 | 73 | ||
| 109 | if (likely(disabled == 1)) { | 74 | bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); |
| 110 | pc = preempt_count(); | 75 | if (bit < 0) |
| 76 | goto out; | ||
| 77 | |||
| 78 | cpu = smp_processor_id(); | ||
| 79 | data = tr->data[cpu]; | ||
| 80 | if (!atomic_read(&data->disabled)) { | ||
| 81 | local_save_flags(flags); | ||
| 111 | trace_function(tr, ip, parent_ip, flags, pc); | 82 | trace_function(tr, ip, parent_ip, flags, pc); |
| 112 | } | 83 | } |
| 84 | trace_clear_recursion(bit); | ||
| 113 | 85 | ||
| 114 | atomic_dec(&data->disabled); | 86 | out: |
| 115 | local_irq_restore(flags); | 87 | preempt_enable_notrace(); |
| 116 | } | 88 | } |
| 117 | 89 | ||
| 118 | static void | 90 | static void |
| @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) | |||
| 185 | { | 157 | { |
| 186 | ftrace_function_enabled = 0; | 158 | ftrace_function_enabled = 0; |
| 187 | 159 | ||
| 188 | if (trace_flags & TRACE_ITER_PREEMPTONLY) | ||
| 189 | trace_ops.func = function_trace_call_preempt_only; | ||
| 190 | else | ||
| 191 | trace_ops.func = function_trace_call; | ||
| 192 | |||
| 193 | if (func_flags.val & TRACE_FUNC_OPT_STACK) | 160 | if (func_flags.val & TRACE_FUNC_OPT_STACK) |
| 194 | register_ftrace_function(&trace_stack_ops); | 161 | register_ftrace_function(&trace_stack_ops); |
| 195 | else | 162 | else |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7e..39ada66389cc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -47,6 +47,8 @@ struct fgraph_data { | |||
| 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
| 48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | 48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 |
| 49 | 49 | ||
| 50 | static unsigned int max_depth; | ||
| 51 | |||
| 50 | static struct tracer_opt trace_opts[] = { | 52 | static struct tracer_opt trace_opts[] = { |
| 51 | /* Display overruns? (for self-debug purpose) */ | 53 | /* Display overruns? (for self-debug purpose) */ |
| 52 | { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, | 54 | { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, |
| @@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) | |||
| 189 | 191 | ||
| 190 | ftrace_pop_return_trace(&trace, &ret, frame_pointer); | 192 | ftrace_pop_return_trace(&trace, &ret, frame_pointer); |
| 191 | trace.rettime = trace_clock_local(); | 193 | trace.rettime = trace_clock_local(); |
| 192 | ftrace_graph_return(&trace); | ||
| 193 | barrier(); | 194 | barrier(); |
| 194 | current->curr_ret_stack--; | 195 | current->curr_ret_stack--; |
| 195 | 196 | ||
| 197 | /* | ||
| 198 | * The trace should run after decrementing the ret counter | ||
| 199 | * in case an interrupt were to come in. We don't want to | ||
| 200 | * lose the interrupt if max_depth is set. | ||
| 201 | */ | ||
| 202 | ftrace_graph_return(&trace); | ||
| 203 | |||
| 196 | if (unlikely(!ret)) { | 204 | if (unlikely(!ret)) { |
| 197 | ftrace_graph_stop(); | 205 | ftrace_graph_stop(); |
| 198 | WARN_ON(1); | 206 | WARN_ON(1); |
| @@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 250 | return 0; | 258 | return 0; |
| 251 | 259 | ||
| 252 | /* trace it when it is-nested-in or is a function enabled. */ | 260 | /* trace it when it is-nested-in or is a function enabled. */ |
| 253 | if (!(trace->depth || ftrace_graph_addr(trace->func)) || | 261 | if ((!(trace->depth || ftrace_graph_addr(trace->func)) || |
| 254 | ftrace_graph_ignore_irqs()) | 262 | ftrace_graph_ignore_irqs()) || |
| 263 | (max_depth && trace->depth >= max_depth)) | ||
| 255 | return 0; | 264 | return 0; |
| 256 | 265 | ||
| 257 | local_irq_save(flags); | 266 | local_irq_save(flags); |
| @@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = { | |||
| 1457 | #endif | 1466 | #endif |
| 1458 | }; | 1467 | }; |
| 1459 | 1468 | ||
| 1469 | |||
| 1470 | static ssize_t | ||
| 1471 | graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, | ||
| 1472 | loff_t *ppos) | ||
| 1473 | { | ||
| 1474 | unsigned long val; | ||
| 1475 | int ret; | ||
| 1476 | |||
| 1477 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
| 1478 | if (ret) | ||
| 1479 | return ret; | ||
| 1480 | |||
| 1481 | max_depth = val; | ||
| 1482 | |||
| 1483 | *ppos += cnt; | ||
| 1484 | |||
| 1485 | return cnt; | ||
| 1486 | } | ||
| 1487 | |||
| 1488 | static ssize_t | ||
| 1489 | graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, | ||
| 1490 | loff_t *ppos) | ||
| 1491 | { | ||
| 1492 | char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ | ||
| 1493 | int n; | ||
| 1494 | |||
| 1495 | n = sprintf(buf, "%d\n", max_depth); | ||
| 1496 | |||
| 1497 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | static const struct file_operations graph_depth_fops = { | ||
| 1501 | .open = tracing_open_generic, | ||
| 1502 | .write = graph_depth_write, | ||
| 1503 | .read = graph_depth_read, | ||
| 1504 | .llseek = generic_file_llseek, | ||
| 1505 | }; | ||
| 1506 | |||
| 1507 | static __init int init_graph_debugfs(void) | ||
| 1508 | { | ||
| 1509 | struct dentry *d_tracer; | ||
| 1510 | |||
| 1511 | d_tracer = tracing_init_dentry(); | ||
| 1512 | if (!d_tracer) | ||
| 1513 | return 0; | ||
| 1514 | |||
| 1515 | trace_create_file("max_graph_depth", 0644, d_tracer, | ||
| 1516 | NULL, &graph_depth_fops); | ||
| 1517 | |||
| 1518 | return 0; | ||
| 1519 | } | ||
| 1520 | fs_initcall(init_graph_debugfs); | ||
| 1521 | |||
| 1460 | static __init int init_graph_trace(void) | 1522 | static __init int init_graph_trace(void) |
| 1461 | { | 1523 | { |
| 1462 | max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); | 1524 | max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); |
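The new max_graph_depth debugfs file created above bounds the function_graph tracer: with the trace->depth >= max_depth test added to trace_graph_entry(), 0 (the default) means no limit and 1 records only the outermost calls (depth 0). A small userspace sketch, not part of the patch, with the usual debugfs mount point assumed:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption; adjust to your debugfs/tracing location. */
	const char *path = "/sys/kernel/debug/tracing/max_graph_depth";
	char buf[16];
	ssize_t n;
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open max_graph_depth");
		return 1;
	}
	if (write(fd, "1", 1) < 0)	/* limit tracing to the outermost calls */
		perror("write");
	lseek(fd, 0, SEEK_SET);		/* the write advanced the file position */
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("max_graph_depth is now %s", buf);
	}
	close(fd);
	return 0;
}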
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d79602dc7..697e88d13907 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -739,12 +739,11 @@ static int task_state_char(unsigned long state) | |||
| 739 | struct trace_event *ftrace_find_event(int type) | 739 | struct trace_event *ftrace_find_event(int type) |
| 740 | { | 740 | { |
| 741 | struct trace_event *event; | 741 | struct trace_event *event; |
| 742 | struct hlist_node *n; | ||
| 743 | unsigned key; | 742 | unsigned key; |
| 744 | 743 | ||
| 745 | key = type & (EVENT_HASHSIZE - 1); | 744 | key = type & (EVENT_HASHSIZE - 1); |
| 746 | 745 | ||
| 747 | hlist_for_each_entry(event, n, &event_hash[key], node) { | 746 | hlist_for_each_entry(event, &event_hash[key], node) { |
| 748 | if (event->type == type) | 747 | if (event->type == type) |
| 749 | return event; | 748 | return event; |
| 750 | } | 749 | } |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 933708677814..5c7e09d10d74 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -66,7 +66,6 @@ | |||
| 66 | #define TP_FLAG_TRACE 1 | 66 | #define TP_FLAG_TRACE 1 |
| 67 | #define TP_FLAG_PROFILE 2 | 67 | #define TP_FLAG_PROFILE 2 |
| 68 | #define TP_FLAG_REGISTERED 4 | 68 | #define TP_FLAG_REGISTERED 4 |
| 69 | #define TP_FLAG_UPROBE 8 | ||
| 70 | 69 | ||
| 71 | 70 | ||
| 72 | /* data_rloc: data relative location, compatible with u32 */ | 71 | /* data_rloc: data relative location, compatible with u32 */ |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9fe45fcefca0..75aa97fbe1a1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -15,8 +15,8 @@ | |||
| 15 | #include <linux/kallsyms.h> | 15 | #include <linux/kallsyms.h> |
| 16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
| 17 | #include <linux/ftrace.h> | 17 | #include <linux/ftrace.h> |
| 18 | #include <linux/sched/rt.h> | ||
| 18 | #include <trace/events/sched.h> | 19 | #include <trace/events/sched.h> |
| 19 | |||
| 20 | #include "trace.h" | 20 | #include "trace.h" |
| 21 | 21 | ||
| 22 | static struct trace_array *wakeup_trace; | 22 | static struct trace_array *wakeup_trace; |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a815..51c819c12c29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, | |||
| 415 | * The ftrace infrastructure should provide the recursion | 415 | * The ftrace infrastructure should provide the recursion |
| 416 | * protection. If not, this will crash the kernel! | 416 | * protection. If not, this will crash the kernel! |
| 417 | */ | 417 | */ |
| 418 | trace_selftest_recursion_cnt++; | 418 | if (trace_selftest_recursion_cnt++ > 10) |
| 419 | return; | ||
| 419 | DYN_FTRACE_TEST_NAME(); | 420 | DYN_FTRACE_TEST_NAME(); |
| 420 | } | 421 | } |
| 421 | 422 | ||
| @@ -452,7 +453,6 @@ trace_selftest_function_recursion(void) | |||
| 452 | char *func_name; | 453 | char *func_name; |
| 453 | int len; | 454 | int len; |
| 454 | int ret; | 455 | int ret; |
| 455 | int cnt; | ||
| 456 | 456 | ||
| 457 | /* The previous test PASSED */ | 457 | /* The previous test PASSED */ |
| 458 | pr_cont("PASSED\n"); | 458 | pr_cont("PASSED\n"); |
| @@ -510,19 +510,10 @@ trace_selftest_function_recursion(void) | |||
| 510 | 510 | ||
| 511 | unregister_ftrace_function(&test_recsafe_probe); | 511 | unregister_ftrace_function(&test_recsafe_probe); |
| 512 | 512 | ||
| 513 | /* | ||
| 514 | * If arch supports all ftrace features, and no other task | ||
| 515 | * was on the list, we should be fine. | ||
| 516 | */ | ||
| 517 | if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) | ||
| 518 | cnt = 2; /* Should have recursed */ | ||
| 519 | else | ||
| 520 | cnt = 1; | ||
| 521 | |||
| 522 | ret = -1; | 513 | ret = -1; |
| 523 | if (trace_selftest_recursion_cnt != cnt) { | 514 | if (trace_selftest_recursion_cnt != 2) { |
| 524 | pr_cont("*callback not called expected %d times (%d)* ", | 515 | pr_cont("*callback not called expected 2 times (%d)* ", |
| 525 | cnt, trace_selftest_recursion_cnt); | 516 | trace_selftest_recursion_cnt); |
| 526 | goto out; | 517 | goto out; |
| 527 | } | 518 | } |
| 528 | 519 | ||
| @@ -568,7 +559,7 @@ trace_selftest_function_regs(void) | |||
| 568 | int ret; | 559 | int ret; |
| 569 | int supported = 0; | 560 | int supported = 0; |
| 570 | 561 | ||
| 571 | #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS | 562 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
| 572 | supported = 1; | 563 | supported = 1; |
| 573 | #endif | 564 | #endif |
| 574 | 565 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c2..7a809e321058 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | #include <trace/syscall.h> | 1 | #include <trace/syscall.h> |
| 2 | #include <trace/events/syscalls.h> | 2 | #include <trace/events/syscalls.h> |
| 3 | #include <linux/syscalls.h> | ||
| 3 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
| 4 | #include <linux/kernel.h> | 5 | #include <linux/kernel.h> |
| 5 | #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ | 6 | #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ |
| @@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name | |||
| 47 | } | 48 | } |
| 48 | #endif | 49 | #endif |
| 49 | 50 | ||
| 51 | #ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS | ||
| 52 | /* | ||
| 53 | * Some architectures that allow for 32bit applications | ||
| 54 | * to run on a 64bit kernel, do not map the syscalls for | ||
| 55 | * the 32bit tasks the same as they do for 64bit tasks. | ||
| 56 | * | ||
| 57 | * *cough*x86*cough* | ||
| 58 | * | ||
| 59 | * In such a case, instead of reporting the wrong syscalls, | ||
| 60 | * simply ignore them. | ||
| 61 | * | ||
| 62 | * For an arch to ignore the compat syscalls it needs to | ||
| 63 | * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as | ||
| 64 | * define the function arch_trace_is_compat_syscall() to let | ||
| 65 | * the tracing system know that it should ignore it. | ||
| 66 | */ | ||
| 67 | static int | ||
| 68 | trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) | ||
| 69 | { | ||
| 70 | if (unlikely(arch_trace_is_compat_syscall(regs))) | ||
| 71 | return -1; | ||
| 72 | |||
| 73 | return syscall_get_nr(task, regs); | ||
| 74 | } | ||
| 75 | #else | ||
| 76 | static inline int | ||
| 77 | trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) | ||
| 78 | { | ||
| 79 | return syscall_get_nr(task, regs); | ||
| 80 | } | ||
| 81 | #endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ | ||
| 82 | |||
| 50 | static __init struct syscall_metadata * | 83 | static __init struct syscall_metadata * |
| 51 | find_syscall_meta(unsigned long syscall) | 84 | find_syscall_meta(unsigned long syscall) |
| 52 | { | 85 | { |
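The block above only names the two per-arch hooks; their implementations live under arch/. A hedged sketch of the arch side — the ARCH_TRACE_IGNORE_COMPAT_SYSCALLS define and the arch_trace_is_compat_syscall() name come from the hunk above, while the compat test it calls is a made-up placeholder, not a real kernel API:

/* Hypothetical arch header sketch; arch_task_in_compat_mode() is invented. */
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS	1

static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
	/*
	 * Return true when the current task entered the kernel through
	 * the 32-bit syscall path, so the tracer skips the event rather
	 * than reporting a wrong syscall number.
	 */
	return arch_task_in_compat_mode(regs);
}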
| @@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) | |||
| 77 | return syscalls_metadata[nr]; | 110 | return syscalls_metadata[nr]; |
| 78 | } | 111 | } |
| 79 | 112 | ||
| 80 | enum print_line_t | 113 | static enum print_line_t |
| 81 | print_syscall_enter(struct trace_iterator *iter, int flags, | 114 | print_syscall_enter(struct trace_iterator *iter, int flags, |
| 82 | struct trace_event *event) | 115 | struct trace_event *event) |
| 83 | { | 116 | { |
| @@ -130,7 +163,7 @@ end: | |||
| 130 | return TRACE_TYPE_HANDLED; | 163 | return TRACE_TYPE_HANDLED; |
| 131 | } | 164 | } |
| 132 | 165 | ||
| 133 | enum print_line_t | 166 | static enum print_line_t |
| 134 | print_syscall_exit(struct trace_iterator *iter, int flags, | 167 | print_syscall_exit(struct trace_iterator *iter, int flags, |
| 135 | struct trace_event *event) | 168 | struct trace_event *event) |
| 136 | { | 169 | { |
| @@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) | |||
| 270 | return ret; | 303 | return ret; |
| 271 | } | 304 | } |
| 272 | 305 | ||
| 273 | void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | 306 | static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) |
| 274 | { | 307 | { |
| 275 | struct syscall_trace_enter *entry; | 308 | struct syscall_trace_enter *entry; |
| 276 | struct syscall_metadata *sys_data; | 309 | struct syscall_metadata *sys_data; |
| 277 | struct ring_buffer_event *event; | 310 | struct ring_buffer_event *event; |
| 278 | struct ring_buffer *buffer; | 311 | struct ring_buffer *buffer; |
| 279 | int size; | ||
| 280 | int syscall_nr; | 312 | int syscall_nr; |
| 313 | int size; | ||
| 281 | 314 | ||
| 282 | syscall_nr = syscall_get_nr(current, regs); | 315 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 283 | if (syscall_nr < 0) | 316 | if (syscall_nr < 0) |
| 284 | return; | 317 | return; |
| 285 | if (!test_bit(syscall_nr, enabled_enter_syscalls)) | 318 | if (!test_bit(syscall_nr, enabled_enter_syscalls)) |
| @@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 305 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 338 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); |
| 306 | } | 339 | } |
| 307 | 340 | ||
| 308 | void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | 341 | static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) |
| 309 | { | 342 | { |
| 310 | struct syscall_trace_exit *entry; | 343 | struct syscall_trace_exit *entry; |
| 311 | struct syscall_metadata *sys_data; | 344 | struct syscall_metadata *sys_data; |
| @@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 313 | struct ring_buffer *buffer; | 346 | struct ring_buffer *buffer; |
| 314 | int syscall_nr; | 347 | int syscall_nr; |
| 315 | 348 | ||
| 316 | syscall_nr = syscall_get_nr(current, regs); | 349 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 317 | if (syscall_nr < 0) | 350 | if (syscall_nr < 0) |
| 318 | return; | 351 | return; |
| 319 | if (!test_bit(syscall_nr, enabled_exit_syscalls)) | 352 | if (!test_bit(syscall_nr, enabled_exit_syscalls)) |
| @@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 337 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 370 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); |
| 338 | } | 371 | } |
| 339 | 372 | ||
| 340 | int reg_event_syscall_enter(struct ftrace_event_call *call) | 373 | static int reg_event_syscall_enter(struct ftrace_event_call *call) |
| 341 | { | 374 | { |
| 342 | int ret = 0; | 375 | int ret = 0; |
| 343 | int num; | 376 | int num; |
| @@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
| 356 | return ret; | 389 | return ret; |
| 357 | } | 390 | } |
| 358 | 391 | ||
| 359 | void unreg_event_syscall_enter(struct ftrace_event_call *call) | 392 | static void unreg_event_syscall_enter(struct ftrace_event_call *call) |
| 360 | { | 393 | { |
| 361 | int num; | 394 | int num; |
| 362 | 395 | ||
| @@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) | |||
| 371 | mutex_unlock(&syscall_trace_lock); | 404 | mutex_unlock(&syscall_trace_lock); |
| 372 | } | 405 | } |
| 373 | 406 | ||
| 374 | int reg_event_syscall_exit(struct ftrace_event_call *call) | 407 | static int reg_event_syscall_exit(struct ftrace_event_call *call) |
| 375 | { | 408 | { |
| 376 | int ret = 0; | 409 | int ret = 0; |
| 377 | int num; | 410 | int num; |
| @@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
| 390 | return ret; | 423 | return ret; |
| 391 | } | 424 | } |
| 392 | 425 | ||
| 393 | void unreg_event_syscall_exit(struct ftrace_event_call *call) | 426 | static void unreg_event_syscall_exit(struct ftrace_event_call *call) |
| 394 | { | 427 | { |
| 395 | int num; | 428 | int num; |
| 396 | 429 | ||
| @@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) | |||
| 459 | return (unsigned long)sys_call_table[nr]; | 492 | return (unsigned long)sys_call_table[nr]; |
| 460 | } | 493 | } |
| 461 | 494 | ||
| 462 | int __init init_ftrace_syscalls(void) | 495 | static int __init init_ftrace_syscalls(void) |
| 463 | { | 496 | { |
| 464 | struct syscall_metadata *meta; | 497 | struct syscall_metadata *meta; |
| 465 | unsigned long addr; | 498 | unsigned long addr; |
| @@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 502 | int rctx; | 535 | int rctx; |
| 503 | int size; | 536 | int size; |
| 504 | 537 | ||
| 505 | syscall_nr = syscall_get_nr(current, regs); | 538 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 506 | if (syscall_nr < 0) | 539 | if (syscall_nr < 0) |
| 507 | return; | 540 | return; |
| 508 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) | 541 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
| @@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 578 | int rctx; | 611 | int rctx; |
| 579 | int size; | 612 | int size; |
| 580 | 613 | ||
| 581 | syscall_nr = syscall_get_nr(current, regs); | 614 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 582 | if (syscall_nr < 0) | 615 | if (syscall_nr < 0) |
| 583 | return; | 616 | return; |
| 584 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) | 617 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67fb..8dad2a92dee9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -28,20 +28,21 @@ | |||
| 28 | 28 | ||
| 29 | #define UPROBE_EVENT_SYSTEM "uprobes" | 29 | #define UPROBE_EVENT_SYSTEM "uprobes" |
| 30 | 30 | ||
| 31 | struct trace_uprobe_filter { | ||
| 32 | rwlock_t rwlock; | ||
| 33 | int nr_systemwide; | ||
| 34 | struct list_head perf_events; | ||
| 35 | }; | ||
| 36 | |||
| 31 | /* | 37 | /* |
| 32 | * uprobe event core functions | 38 | * uprobe event core functions |
| 33 | */ | 39 | */ |
| 34 | struct trace_uprobe; | ||
| 35 | struct uprobe_trace_consumer { | ||
| 36 | struct uprobe_consumer cons; | ||
| 37 | struct trace_uprobe *tu; | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct trace_uprobe { | 40 | struct trace_uprobe { |
| 41 | struct list_head list; | 41 | struct list_head list; |
| 42 | struct ftrace_event_class class; | 42 | struct ftrace_event_class class; |
| 43 | struct ftrace_event_call call; | 43 | struct ftrace_event_call call; |
| 44 | struct uprobe_trace_consumer *consumer; | 44 | struct trace_uprobe_filter filter; |
| 45 | struct uprobe_consumer consumer; | ||
| 45 | struct inode *inode; | 46 | struct inode *inode; |
| 46 | char *filename; | 47 | char *filename; |
| 47 | unsigned long offset; | 48 | unsigned long offset; |
| @@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list); | |||
| 64 | 65 | ||
| 65 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | 66 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); |
| 66 | 67 | ||
| 68 | static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) | ||
| 69 | { | ||
| 70 | rwlock_init(&filter->rwlock); | ||
| 71 | filter->nr_systemwide = 0; | ||
| 72 | INIT_LIST_HEAD(&filter->perf_events); | ||
| 73 | } | ||
| 74 | |||
| 75 | static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) | ||
| 76 | { | ||
| 77 | return !filter->nr_systemwide && list_empty(&filter->perf_events); | ||
| 78 | } | ||
| 79 | |||
| 67 | /* | 80 | /* |
| 68 | * Allocate new trace_uprobe and initialize it (including uprobes). | 81 | * Allocate new trace_uprobe and initialize it (including uprobes). |
| 69 | */ | 82 | */ |
| @@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) | |||
| 92 | goto error; | 105 | goto error; |
| 93 | 106 | ||
| 94 | INIT_LIST_HEAD(&tu->list); | 107 | INIT_LIST_HEAD(&tu->list); |
| 108 | tu->consumer.handler = uprobe_dispatcher; | ||
| 109 | init_trace_uprobe_filter(&tu->filter); | ||
| 95 | return tu; | 110 | return tu; |
| 96 | 111 | ||
| 97 | error: | 112 | error: |
| @@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv) | |||
| 253 | if (ret) | 268 | if (ret) |
| 254 | goto fail_address_parse; | 269 | goto fail_address_parse; |
| 255 | 270 | ||
| 271 | inode = igrab(path.dentry->d_inode); | ||
| 272 | path_put(&path); | ||
| 273 | |||
| 274 | if (!inode || !S_ISREG(inode->i_mode)) { | ||
| 275 | ret = -EINVAL; | ||
| 276 | goto fail_address_parse; | ||
| 277 | } | ||
| 278 | |||
| 256 | ret = kstrtoul(arg, 0, &offset); | 279 | ret = kstrtoul(arg, 0, &offset); |
| 257 | if (ret) | 280 | if (ret) |
| 258 | goto fail_address_parse; | 281 | goto fail_address_parse; |
| 259 | 282 | ||
| 260 | inode = igrab(path.dentry->d_inode); | ||
| 261 | |||
| 262 | argc -= 2; | 283 | argc -= 2; |
| 263 | argv += 2; | 284 | argv += 2; |
| 264 | 285 | ||
| @@ -356,7 +377,7 @@ fail_address_parse: | |||
| 356 | if (inode) | 377 | if (inode) |
| 357 | iput(inode); | 378 | iput(inode); |
| 358 | 379 | ||
| 359 | pr_info("Failed to parse address.\n"); | 380 | pr_info("Failed to parse address or file.\n"); |
| 360 | 381 | ||
| 361 | return ret; | 382 | return ret; |
| 362 | } | 383 | } |
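create_trace_uprobe() is fed by writes to the uprobe_events control file; the hunks above make it grab and validate the target inode (it must be a regular file) before parsing the offset, which is why the failure message becomes "Failed to parse address or file." A sketch of registering a probe that way — the mount point, binary, offset and event name are all assumptions, and the "p[:EVENT] PATH:OFFSET" format is taken from the uprobe tracer documentation rather than from this diff:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Every value below is illustrative only. */
	const char *def = "p:my_probe /bin/bash:0x4245c0\n";
	int fd = open("/sys/kernel/debug/tracing/uprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open uprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write uprobe_events");
	close(fd);
	return 0;
}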
| @@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { | |||
| 465 | }; | 486 | }; |
| 466 | 487 | ||
| 467 | /* uprobe handler */ | 488 | /* uprobe handler */ |
| 468 | static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | 489 | static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) |
| 469 | { | 490 | { |
| 470 | struct uprobe_trace_entry_head *entry; | 491 | struct uprobe_trace_entry_head *entry; |
| 471 | struct ring_buffer_event *event; | 492 | struct ring_buffer_event *event; |
| @@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
| 475 | unsigned long irq_flags; | 496 | unsigned long irq_flags; |
| 476 | struct ftrace_event_call *call = &tu->call; | 497 | struct ftrace_event_call *call = &tu->call; |
| 477 | 498 | ||
| 478 | tu->nhit++; | ||
| 479 | |||
| 480 | local_save_flags(irq_flags); | 499 | local_save_flags(irq_flags); |
| 481 | pc = preempt_count(); | 500 | pc = preempt_count(); |
| 482 | 501 | ||
| @@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
| 485 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 504 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
| 486 | size, irq_flags, pc); | 505 | size, irq_flags, pc); |
| 487 | if (!event) | 506 | if (!event) |
| 488 | return; | 507 | return 0; |
| 489 | 508 | ||
| 490 | entry = ring_buffer_event_data(event); | 509 | entry = ring_buffer_event_data(event); |
| 491 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | 510 | entry->ip = instruction_pointer(task_pt_regs(current)); |
| 492 | data = (u8 *)&entry[1]; | 511 | data = (u8 *)&entry[1]; |
| 493 | for (i = 0; i < tu->nr_args; i++) | 512 | for (i = 0; i < tu->nr_args; i++) |
| 494 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 513 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
| 495 | 514 | ||
| 496 | if (!filter_current_check_discard(buffer, call, entry, event)) | 515 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 497 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); | 516 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); |
| 517 | |||
| 518 | return 0; | ||
| 498 | } | 519 | } |
| 499 | 520 | ||
| 500 | /* Event entry printers */ | 521 | /* Event entry printers */ |
| @@ -533,42 +554,43 @@ partial: | |||
| 533 | return TRACE_TYPE_PARTIAL_LINE; | 554 | return TRACE_TYPE_PARTIAL_LINE; |
| 534 | } | 555 | } |
| 535 | 556 | ||
| 536 | static int probe_event_enable(struct trace_uprobe *tu, int flag) | 557 | static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) |
| 537 | { | 558 | { |
| 538 | struct uprobe_trace_consumer *utc; | 559 | return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); |
| 539 | int ret = 0; | 560 | } |
| 540 | 561 | ||
| 541 | if (!tu->inode || tu->consumer) | 562 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, |
| 542 | return -EINTR; | 563 | enum uprobe_filter_ctx ctx, |
| 564 | struct mm_struct *mm); | ||
| 543 | 565 | ||
| 544 | utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); | 566 | static int |
| 545 | if (!utc) | 567 | probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) |
| 568 | { | ||
| 569 | int ret = 0; | ||
| 570 | |||
| 571 | if (is_trace_uprobe_enabled(tu)) | ||
| 546 | return -EINTR; | 572 | return -EINTR; |
| 547 | 573 | ||
| 548 | utc->cons.handler = uprobe_dispatcher; | 574 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); |
| 549 | utc->cons.filter = NULL; | ||
| 550 | ret = uprobe_register(tu->inode, tu->offset, &utc->cons); | ||
| 551 | if (ret) { | ||
| 552 | kfree(utc); | ||
| 553 | return ret; | ||
| 554 | } | ||
| 555 | 575 | ||
| 556 | tu->flags |= flag; | 576 | tu->flags |= flag; |
| 557 | utc->tu = tu; | 577 | tu->consumer.filter = filter; |
| 558 | tu->consumer = utc; | 578 | ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); |
| 579 | if (ret) | ||
| 580 | tu->flags &= ~flag; | ||
| 559 | 581 | ||
| 560 | return 0; | 582 | return ret; |
| 561 | } | 583 | } |
| 562 | 584 | ||
| 563 | static void probe_event_disable(struct trace_uprobe *tu, int flag) | 585 | static void probe_event_disable(struct trace_uprobe *tu, int flag) |
| 564 | { | 586 | { |
| 565 | if (!tu->inode || !tu->consumer) | 587 | if (!is_trace_uprobe_enabled(tu)) |
| 566 | return; | 588 | return; |
| 567 | 589 | ||
| 568 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); | 590 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); |
| 591 | |||
| 592 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer); | ||
| 569 | tu->flags &= ~flag; | 593 | tu->flags &= ~flag; |
| 570 | kfree(tu->consumer); | ||
| 571 | tu->consumer = NULL; | ||
| 572 | } | 594 | } |
| 573 | 595 | ||
| 574 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | 596 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) |
| @@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu) | |||
| 642 | } | 664 | } |
| 643 | 665 | ||
| 644 | #ifdef CONFIG_PERF_EVENTS | 666 | #ifdef CONFIG_PERF_EVENTS |
| 667 | static bool | ||
| 668 | __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | ||
| 669 | { | ||
| 670 | struct perf_event *event; | ||
| 671 | |||
| 672 | if (filter->nr_systemwide) | ||
| 673 | return true; | ||
| 674 | |||
| 675 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | ||
| 676 | if (event->hw.tp_target->mm == mm) | ||
| 677 | return true; | ||
| 678 | } | ||
| 679 | |||
| 680 | return false; | ||
| 681 | } | ||
| 682 | |||
| 683 | static inline bool | ||
| 684 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | ||
| 685 | { | ||
| 686 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); | ||
| 687 | } | ||
| 688 | |||
| 689 | static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) | ||
| 690 | { | ||
| 691 | bool done; | ||
| 692 | |||
| 693 | write_lock(&tu->filter.rwlock); | ||
| 694 | if (event->hw.tp_target) { | ||
| 695 | /* | ||
| 696 | * event->parent != NULL means copy_process(), we can avoid | ||
| 697 | * uprobe_apply(). current->mm must be probed and we can rely | ||
| 698 | * on dup_mmap() which preserves the already installed bp's. | ||
| 699 | * | ||
| 700 | * attr.enable_on_exec means that exec/mmap will install the | ||
| 701 | * breakpoints we need. | ||
| 702 | */ | ||
| 703 | done = tu->filter.nr_systemwide || | ||
| 704 | event->parent || event->attr.enable_on_exec || | ||
| 705 | uprobe_filter_event(tu, event); | ||
| 706 | list_add(&event->hw.tp_list, &tu->filter.perf_events); | ||
| 707 | } else { | ||
| 708 | done = tu->filter.nr_systemwide; | ||
| 709 | tu->filter.nr_systemwide++; | ||
| 710 | } | ||
| 711 | write_unlock(&tu->filter.rwlock); | ||
| 712 | |||
| 713 | if (!done) | ||
| 714 | uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); | ||
| 715 | |||
| 716 | return 0; | ||
| 717 | } | ||
| 718 | |||
| 719 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | ||
| 720 | { | ||
| 721 | bool done; | ||
| 722 | |||
| 723 | write_lock(&tu->filter.rwlock); | ||
| 724 | if (event->hw.tp_target) { | ||
| 725 | list_del(&event->hw.tp_list); | ||
| 726 | done = tu->filter.nr_systemwide || | ||
| 727 | (event->hw.tp_target->flags & PF_EXITING) || | ||
| 728 | uprobe_filter_event(tu, event); | ||
| 729 | } else { | ||
| 730 | tu->filter.nr_systemwide--; | ||
| 731 | done = tu->filter.nr_systemwide; | ||
| 732 | } | ||
| 733 | write_unlock(&tu->filter.rwlock); | ||
| 734 | |||
| 735 | if (!done) | ||
| 736 | uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); | ||
| 737 | |||
| 738 | return 0; | ||
| 739 | } | ||
| 740 | |||
| 741 | static bool uprobe_perf_filter(struct uprobe_consumer *uc, | ||
| 742 | enum uprobe_filter_ctx ctx, struct mm_struct *mm) | ||
| 743 | { | ||
| 744 | struct trace_uprobe *tu; | ||
| 745 | int ret; | ||
| 746 | |||
| 747 | tu = container_of(uc, struct trace_uprobe, consumer); | ||
| 748 | read_lock(&tu->filter.rwlock); | ||
| 749 | ret = __uprobe_perf_filter(&tu->filter, mm); | ||
| 750 | read_unlock(&tu->filter.rwlock); | ||
| 751 | |||
| 752 | return ret; | ||
| 753 | } | ||
| 754 | |||
| 645 | /* uprobe profile handler */ | 755 | /* uprobe profile handler */ |
| 646 | static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | 756 | static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) |
| 647 | { | 757 | { |
| 648 | struct ftrace_event_call *call = &tu->call; | 758 | struct ftrace_event_call *call = &tu->call; |
| 649 | struct uprobe_trace_entry_head *entry; | 759 | struct uprobe_trace_entry_head *entry; |
| @@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
| 652 | int size, __size, i; | 762 | int size, __size, i; |
| 653 | int rctx; | 763 | int rctx; |
| 654 | 764 | ||
| 765 | if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) | ||
| 766 | return UPROBE_HANDLER_REMOVE; | ||
| 767 | |||
| 655 | __size = sizeof(*entry) + tu->size; | 768 | __size = sizeof(*entry) + tu->size; |
| 656 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 769 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 657 | size -= sizeof(u32); | 770 | size -= sizeof(u32); |
| 658 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | 771 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) |
| 659 | return; | 772 | return 0; |
| 660 | 773 | ||
| 661 | preempt_disable(); | 774 | preempt_disable(); |
| 662 | 775 | ||
| @@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
| 664 | if (!entry) | 777 | if (!entry) |
| 665 | goto out; | 778 | goto out; |
| 666 | 779 | ||
| 667 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | 780 | entry->ip = instruction_pointer(task_pt_regs(current)); |
| 668 | data = (u8 *)&entry[1]; | 781 | data = (u8 *)&entry[1]; |
| 669 | for (i = 0; i < tu->nr_args; i++) | 782 | for (i = 0; i < tu->nr_args; i++) |
| 670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 783 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
| @@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
| 674 | 787 | ||
| 675 | out: | 788 | out: |
| 676 | preempt_enable(); | 789 | preempt_enable(); |
| 790 | return 0; | ||
| 677 | } | 791 | } |
| 678 | #endif /* CONFIG_PERF_EVENTS */ | 792 | #endif /* CONFIG_PERF_EVENTS */ |
| 679 | 793 | ||
| @@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, | |||
| 684 | 798 | ||
| 685 | switch (type) { | 799 | switch (type) { |
| 686 | case TRACE_REG_REGISTER: | 800 | case TRACE_REG_REGISTER: |
| 687 | return probe_event_enable(tu, TP_FLAG_TRACE); | 801 | return probe_event_enable(tu, TP_FLAG_TRACE, NULL); |
| 688 | 802 | ||
| 689 | case TRACE_REG_UNREGISTER: | 803 | case TRACE_REG_UNREGISTER: |
| 690 | probe_event_disable(tu, TP_FLAG_TRACE); | 804 | probe_event_disable(tu, TP_FLAG_TRACE); |
| @@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, | |||
| 692 | 806 | ||
| 693 | #ifdef CONFIG_PERF_EVENTS | 807 | #ifdef CONFIG_PERF_EVENTS |
| 694 | case TRACE_REG_PERF_REGISTER: | 808 | case TRACE_REG_PERF_REGISTER: |
| 695 | return probe_event_enable(tu, TP_FLAG_PROFILE); | 809 | return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); |
| 696 | 810 | ||
| 697 | case TRACE_REG_PERF_UNREGISTER: | 811 | case TRACE_REG_PERF_UNREGISTER: |
| 698 | probe_event_disable(tu, TP_FLAG_PROFILE); | 812 | probe_event_disable(tu, TP_FLAG_PROFILE); |
| 699 | return 0; | 813 | return 0; |
| 814 | |||
| 815 | case TRACE_REG_PERF_OPEN: | ||
| 816 | return uprobe_perf_open(tu, data); | ||
| 817 | |||
| 818 | case TRACE_REG_PERF_CLOSE: | ||
| 819 | return uprobe_perf_close(tu, data); | ||
| 820 | |||
| 700 | #endif | 821 | #endif |
| 701 | default: | 822 | default: |
| 702 | return 0; | 823 | return 0; |
| @@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, | |||
| 706 | 827 | ||
| 707 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | 828 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) |
| 708 | { | 829 | { |
| 709 | struct uprobe_trace_consumer *utc; | ||
| 710 | struct trace_uprobe *tu; | 830 | struct trace_uprobe *tu; |
| 831 | int ret = 0; | ||
| 711 | 832 | ||
| 712 | utc = container_of(con, struct uprobe_trace_consumer, cons); | 833 | tu = container_of(con, struct trace_uprobe, consumer); |
| 713 | tu = utc->tu; | 834 | tu->nhit++; |
| 714 | if (!tu || tu->consumer != utc) | ||
| 715 | return 0; | ||
| 716 | 835 | ||
| 717 | if (tu->flags & TP_FLAG_TRACE) | 836 | if (tu->flags & TP_FLAG_TRACE) |
| 718 | uprobe_trace_func(tu, regs); | 837 | ret |= uprobe_trace_func(tu, regs); |
| 719 | 838 | ||
| 720 | #ifdef CONFIG_PERF_EVENTS | 839 | #ifdef CONFIG_PERF_EVENTS |
| 721 | if (tu->flags & TP_FLAG_PROFILE) | 840 | if (tu->flags & TP_FLAG_PROFILE) |
| 722 | uprobe_perf_func(tu, regs); | 841 | ret |= uprobe_perf_func(tu, regs); |
| 723 | #endif | 842 | #endif |
| 724 | return 0; | 843 | return ret; |
| 725 | } | 844 | } |
| 726 | 845 | ||
| 727 | static struct trace_event_functions uprobe_funcs = { | 846 | static struct trace_event_functions uprobe_funcs = { |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d96ba22dabfa..0c05a4592047 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, | |||
| 192 | static struct tracepoint_entry *get_tracepoint(const char *name) | 192 | static struct tracepoint_entry *get_tracepoint(const char *name) |
| 193 | { | 193 | { |
| 194 | struct hlist_head *head; | 194 | struct hlist_head *head; |
| 195 | struct hlist_node *node; | ||
| 196 | struct tracepoint_entry *e; | 195 | struct tracepoint_entry *e; |
| 197 | u32 hash = jhash(name, strlen(name), 0); | 196 | u32 hash = jhash(name, strlen(name), 0); |
| 198 | 197 | ||
| 199 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; | 198 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; |
| 200 | hlist_for_each_entry(e, node, head, hlist) { | 199 | hlist_for_each_entry(e, head, hlist) { |
| 201 | if (!strcmp(name, e->name)) | 200 | if (!strcmp(name, e->name)) |
| 202 | return e; | 201 | return e; |
| 203 | } | 202 | } |
| @@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name) | |||
| 211 | static struct tracepoint_entry *add_tracepoint(const char *name) | 210 | static struct tracepoint_entry *add_tracepoint(const char *name) |
| 212 | { | 211 | { |
| 213 | struct hlist_head *head; | 212 | struct hlist_head *head; |
| 214 | struct hlist_node *node; | ||
| 215 | struct tracepoint_entry *e; | 213 | struct tracepoint_entry *e; |
| 216 | size_t name_len = strlen(name) + 1; | 214 | size_t name_len = strlen(name) + 1; |
| 217 | u32 hash = jhash(name, name_len-1, 0); | 215 | u32 hash = jhash(name, name_len-1, 0); |
| 218 | 216 | ||
| 219 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; | 217 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; |
| 220 | hlist_for_each_entry(e, node, head, hlist) { | 218 | hlist_for_each_entry(e, head, hlist) { |
| 221 | if (!strcmp(name, e->name)) { | 219 | if (!strcmp(name, e->name)) { |
| 222 | printk(KERN_NOTICE | 220 | printk(KERN_NOTICE |
| 223 | "tracepoint %s busy\n", name); | 221 | "tracepoint %s busy\n", name); |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 625df0b44690..a1dd9a1b1327 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, | |||
| 32 | { | 32 | { |
| 33 | const struct cred *tcred; | 33 | const struct cred *tcred; |
| 34 | struct timespec uptime, ts; | 34 | struct timespec uptime, ts; |
| 35 | cputime_t utime, stime, utimescaled, stimescaled; | ||
| 35 | u64 ac_etime; | 36 | u64 ac_etime; |
| 36 | 37 | ||
| 37 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); | 38 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); |
| @@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns, | |||
| 65 | stats->ac_ppid = pid_alive(tsk) ? | 66 | stats->ac_ppid = pid_alive(tsk) ? |
| 66 | task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; | 67 | task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; |
| 67 | rcu_read_unlock(); | 68 | rcu_read_unlock(); |
| 68 | stats->ac_utime = cputime_to_usecs(tsk->utime); | 69 | |
| 69 | stats->ac_stime = cputime_to_usecs(tsk->stime); | 70 | task_cputime(tsk, &utime, &stime); |
| 70 | stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); | 71 | stats->ac_utime = cputime_to_usecs(utime); |
| 71 | stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); | 72 | stats->ac_stime = cputime_to_usecs(stime); |
| 73 | |||
| 74 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); | ||
| 75 | stats->ac_utimescaled = cputime_to_usecs(utimescaled); | ||
| 76 | stats->ac_stimescaled = cputime_to_usecs(stimescaled); | ||
| 77 | |||
| 72 | stats->ac_minflt = tsk->min_flt; | 78 | stats->ac_minflt = tsk->min_flt; |
| 73 | stats->ac_majflt = tsk->maj_flt; | 79 | stats->ac_majflt = tsk->maj_flt; |
| 74 | 80 | ||
| @@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
| 115 | #undef KB | 121 | #undef KB |
| 116 | #undef MB | 122 | #undef MB |
| 117 | 123 | ||
| 118 | /** | 124 | static void __acct_update_integrals(struct task_struct *tsk, |
| 119 | * acct_update_integrals - update mm integral fields in task_struct | 125 | cputime_t utime, cputime_t stime) |
| 120 | * @tsk: task_struct for accounting | ||
| 121 | */ | ||
| 122 | void acct_update_integrals(struct task_struct *tsk) | ||
| 123 | { | 126 | { |
| 124 | if (likely(tsk->mm)) { | 127 | if (likely(tsk->mm)) { |
| 125 | cputime_t time, dtime; | 128 | cputime_t time, dtime; |
| @@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
| 128 | u64 delta; | 131 | u64 delta; |
| 129 | 132 | ||
| 130 | local_irq_save(flags); | 133 | local_irq_save(flags); |
| 131 | time = tsk->stime + tsk->utime; | 134 | time = stime + utime; |
| 132 | dtime = time - tsk->acct_timexpd; | 135 | dtime = time - tsk->acct_timexpd; |
| 133 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 136 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); |
| 134 | delta = value.tv_sec; | 137 | delta = value.tv_sec; |
| @@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk) | |||
| 145 | } | 148 | } |
| 146 | 149 | ||
| 147 | /** | 150 | /** |
| 151 | * acct_update_integrals - update mm integral fields in task_struct | ||
| 152 | * @tsk: task_struct for accounting | ||
| 153 | */ | ||
| 154 | void acct_update_integrals(struct task_struct *tsk) | ||
| 155 | { | ||
| 156 | cputime_t utime, stime; | ||
| 157 | |||
| 158 | task_cputime(tsk, &utime, &stime); | ||
| 159 | __acct_update_integrals(tsk, utime, stime); | ||
| 160 | } | ||
| 161 | |||
| 162 | /** | ||
| 163 | * acct_account_cputime - update mm integral after cputime update | ||
| 164 | * @tsk: task_struct for accounting | ||
| 165 | */ | ||
| 166 | void acct_account_cputime(struct task_struct *tsk) | ||
| 167 | { | ||
| 168 | __acct_update_integrals(tsk, tsk->utime, tsk->stime); | ||
| 169 | } | ||
| 170 | |||
| 171 | /** | ||
| 148 | * acct_clear_integrals - clear the mm integral fields in task_struct | 172 | * acct_clear_integrals - clear the mm integral fields in task_struct |
| 149 | * @tsk: task_struct whose accounting fields are cleared | 173 | * @tsk: task_struct whose accounting fields are cleared |
| 150 | */ | 174 | */ |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 1744bb80f1fb..394f70b17162 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
| @@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister); | |||
| 34 | void fire_user_return_notifiers(void) | 34 | void fire_user_return_notifiers(void) |
| 35 | { | 35 | { |
| 36 | struct user_return_notifier *urn; | 36 | struct user_return_notifier *urn; |
| 37 | struct hlist_node *tmp1, *tmp2; | 37 | struct hlist_node *tmp2; |
| 38 | struct hlist_head *head; | 38 | struct hlist_head *head; |
| 39 | 39 | ||
| 40 | head = &get_cpu_var(return_notifier_list); | 40 | head = &get_cpu_var(return_notifier_list); |
| 41 | hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) | 41 | hlist_for_each_entry_safe(urn, tmp2, head, link) |
| 42 | urn->on_user_return(urn); | 42 | urn->on_user_return(urn); |
| 43 | put_cpu_var(return_notifier_list); | 43 | put_cpu_var(return_notifier_list); |
| 44 | } | 44 | } |
diff --git a/kernel/user.c b/kernel/user.c index 33acb5e53a5f..e81978e8c03b 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -47,9 +47,7 @@ struct user_namespace init_user_ns = { | |||
| 47 | .count = 4294967295U, | 47 | .count = 4294967295U, |
| 48 | }, | 48 | }, |
| 49 | }, | 49 | }, |
| 50 | .kref = { | 50 | .count = ATOMIC_INIT(3), |
| 51 | .refcount = ATOMIC_INIT(3), | ||
| 52 | }, | ||
| 53 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
| 54 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
| 55 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
| @@ -107,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up) | |||
| 107 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) | 105 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
| 108 | { | 106 | { |
| 109 | struct user_struct *user; | 107 | struct user_struct *user; |
| 110 | struct hlist_node *h; | ||
| 111 | 108 | ||
| 112 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 109 | hlist_for_each_entry(user, hashent, uidhash_node) { |
| 113 | if (uid_eq(user->uid, uid)) { | 110 | if (uid_eq(user->uid, uid)) { |
| 114 | atomic_inc(&user->__count); | 111 | atomic_inc(&user->__count); |
| 115 | return user; | 112 | return user; |
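init_user_ns above swaps the embedded struct kref for a bare atomic_t count, matching the create_user_ns()/free_user_ns() changes in the next file, where the namespace chain is now freed iteratively once the count drops to zero. A minimal sketch of that get/put pattern, using hypothetical helper names (the real get_user_ns()/put_user_ns() live in include/linux/user_namespace.h and are not shown in this diff):

#include <linux/atomic.h>
#include <linux/slab.h>

struct my_ns {
	atomic_t count;
	struct my_ns *parent;
};

static struct my_ns *my_get_ns(struct my_ns *ns)
{
	atomic_inc(&ns->count);
	return ns;
}

static void my_free_ns(struct my_ns *ns)
{
	struct my_ns *parent;

	/* free the whole parent chain iteratively rather than by
	 * recursion, as the reworked free_user_ns() does */
	do {
		parent = ns->parent;
		kfree(ns);
		ns = parent;
	} while (parent && atomic_dec_and_test(&parent->count));
}

static void my_put_ns(struct my_ns *ns)
{
	if (atomic_dec_and_test(&ns->count))
		my_free_ns(ns);
}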
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c42fbc4..8b650837083e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -78,7 +78,7 @@ int create_user_ns(struct cred *new) | |||
| 78 | return ret; | 78 | return ret; |
| 79 | } | 79 | } |
| 80 | 80 | ||
| 81 | kref_init(&ns->kref); | 81 | atomic_set(&ns->count, 1); |
| 82 | /* Leave the new->user_ns reference with the new user namespace. */ | 82 | /* Leave the new->user_ns reference with the new user namespace. */ |
| 83 | ns->parent = parent_ns; | 83 | ns->parent = parent_ns; |
| 84 | ns->owner = owner; | 84 | ns->owner = owner; |
| @@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | |||
| 104 | return create_user_ns(cred); | 104 | return create_user_ns(cred); |
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | void free_user_ns(struct kref *kref) | 107 | void free_user_ns(struct user_namespace *ns) |
| 108 | { | 108 | { |
| 109 | struct user_namespace *parent, *ns = | 109 | struct user_namespace *parent; |
| 110 | container_of(kref, struct user_namespace, kref); | ||
| 111 | 110 | ||
| 112 | parent = ns->parent; | 111 | do { |
| 113 | proc_free_inum(ns->proc_inum); | 112 | parent = ns->parent; |
| 114 | kmem_cache_free(user_ns_cachep, ns); | 113 | proc_free_inum(ns->proc_inum); |
| 115 | put_user_ns(parent); | 114 | kmem_cache_free(user_ns_cachep, ns); |
| 115 | ns = parent; | ||
| 116 | } while (atomic_dec_and_test(&parent->count)); | ||
| 116 | } | 117 | } |
| 117 | EXPORT_SYMBOL(free_user_ns); | 118 | EXPORT_SYMBOL(free_user_ns); |
| 118 | 119 | ||
| @@ -519,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = { | |||
| 519 | .show = projid_m_show, | 520 | .show = projid_m_show, |
| 520 | }; | 521 | }; |
| 521 | 522 | ||
| 523 | static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) | ||
| 524 | { | ||
| 525 | u32 upper_first, lower_first, upper_last, lower_last; | ||
| 526 | unsigned idx; | ||
| 527 | |||
| 528 | upper_first = extent->first; | ||
| 529 | lower_first = extent->lower_first; | ||
| 530 | upper_last = upper_first + extent->count - 1; | ||
| 531 | lower_last = lower_first + extent->count - 1; | ||
| 532 | |||
| 533 | for (idx = 0; idx < new_map->nr_extents; idx++) { | ||
| 534 | u32 prev_upper_first, prev_lower_first; | ||
| 535 | u32 prev_upper_last, prev_lower_last; | ||
| 536 | struct uid_gid_extent *prev; | ||
| 537 | |||
| 538 | prev = &new_map->extent[idx]; | ||
| 539 | |||
| 540 | prev_upper_first = prev->first; | ||
| 541 | prev_lower_first = prev->lower_first; | ||
| 542 | prev_upper_last = prev_upper_first + prev->count - 1; | ||
| 543 | prev_lower_last = prev_lower_first + prev->count - 1; | ||
| 544 | |||
| 545 | /* Does the upper range intersect a previous extent? */ | ||
| 546 | if ((prev_upper_first <= upper_last) && | ||
| 547 | (prev_upper_last >= upper_first)) | ||
| 548 | return true; | ||
| 549 | |||
| 550 | /* Does the lower range intersect a previous extent? */ | ||
| 551 | if ((prev_lower_first <= lower_last) && | ||
| 552 | (prev_lower_last >= lower_first)) | ||
| 553 | return true; | ||
| 554 | } | ||
| 555 | return false; | ||
| 556 | } | ||
| 557 | |||
| 558 | |||
| 522 | static DEFINE_MUTEX(id_map_mutex); | 559 | static DEFINE_MUTEX(id_map_mutex); |
| 523 | 560 | ||
| 524 | static ssize_t map_write(struct file *file, const char __user *buf, | 561 | static ssize_t map_write(struct file *file, const char __user *buf, |
| @@ -531,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 531 | struct user_namespace *ns = seq->private; | 568 | struct user_namespace *ns = seq->private; |
| 532 | struct uid_gid_map new_map; | 569 | struct uid_gid_map new_map; |
| 533 | unsigned idx; | 570 | unsigned idx; |
| 534 | struct uid_gid_extent *extent, *last = NULL; | 571 | struct uid_gid_extent *extent = NULL; |
| 535 | unsigned long page = 0; | 572 | unsigned long page = 0; |
| 536 | char *kbuf, *pos, *next_line; | 573 | char *kbuf, *pos, *next_line; |
| 537 | ssize_t ret = -EINVAL; | 574 | ssize_t ret = -EINVAL; |
| @@ -634,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 634 | if ((extent->lower_first + extent->count) <= extent->lower_first) | 671 | if ((extent->lower_first + extent->count) <= extent->lower_first) |
| 635 | goto out; | 672 | goto out; |
| 636 | 673 | ||
| 637 | /* For now only accept extents that are strictly in order */ | 674 | /* Do the ranges in extent overlap any previous extents? */ |
| 638 | if (last && | 675 | if (mappings_overlap(&new_map, extent)) |
| 639 | (((last->first + last->count) > extent->first) || | ||
| 640 | ((last->lower_first + last->count) > extent->lower_first))) | ||
| 641 | goto out; | 676 | goto out; |
| 642 | 677 | ||
| 643 | new_map.nr_extents++; | 678 | new_map.nr_extents++; |
| 644 | last = extent; | ||
| 645 | 679 | ||
| 646 | /* Fail if the file contains too many extents */ | 680 | /* Fail if the file contains too many extents */ |
| 647 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | 681 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && |
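mappings_overlap() above replaces the old "extents must be strictly ordered" rule with a pairwise interval-intersection test: a new extent is rejected if either its upper (namespace-side) range or its lower (parent-side) range intersects any extent already accepted into new_map. The core check is plain closed-interval arithmetic; a standalone sketch of the same test, assuming ranges given as first id plus count:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* closed intervals [a_first, a_first + a_count - 1] and
 * [b_first, b_first + b_count - 1] intersect iff neither lies
 * entirely before the other */
static bool ranges_intersect(uint32_t a_first, uint32_t a_count,
			     uint32_t b_first, uint32_t b_count)
{
	uint32_t a_last = a_first + a_count - 1;
	uint32_t b_last = b_first + b_count - 1;

	return a_first <= b_last && a_last >= b_first;
}

int main(void)
{
	/* 0-999 vs 1000-1999: disjoint; 500-1499 vs 1000-1999: overlap */
	printf("%d %d\n",
	       ranges_intersect(0, 1000, 1000, 1000),
	       ranges_intersect(500, 1000, 1000, 1000));
	return 0;
}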
diff --git a/kernel/utsname.c b/kernel/utsname.c index 08b197e8c485..a47fc5de3113 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void) | |||
| 30 | /* | 30 | /* |
| 31 | * Clone a new ns copying an original utsname, setting refcount to 1 | 31 | * Clone a new ns copying an original utsname, setting refcount to 1 |
| 32 | * @old_ns: namespace to clone | 32 | * @old_ns: namespace to clone |
| 33 | * Return NULL on error (failure to kmalloc), new ns otherwise | 33 | * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise |
| 34 | */ | 34 | */ |
| 35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | 35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, |
| 36 | struct uts_namespace *old_ns) | 36 | struct uts_namespace *old_ns) |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 63da38c2d820..4f69f9a5e221 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
| 16 | #include <linux/wait.h> | 16 | #include <linux/wait.h> |
| 17 | 17 | ||
| 18 | #ifdef CONFIG_PROC_SYSCTL | ||
| 19 | |||
| 18 | static void *get_uts(ctl_table *table, int write) | 20 | static void *get_uts(ctl_table *table, int write) |
| 19 | { | 21 | { |
| 20 | char *which = table->data; | 22 | char *which = table->data; |
| @@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which) | |||
| 38 | up_write(&uts_sem); | 40 | up_write(&uts_sem); |
| 39 | } | 41 | } |
| 40 | 42 | ||
| 41 | #ifdef CONFIG_PROC_SYSCTL | ||
| 42 | /* | 43 | /* |
| 43 | * Special case of dostring for the UTS structure. This has locks | 44 | * Special case of dostring for the UTS structure. This has locks |
| 44 | * to observe. Should this be in kernel/sys.c ???? | 45 | * to observe. Should this be in kernel/sys.c ???? |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3d0b02..4a944676358e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
| 24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
| 25 | #include <linux/smpboot.h> | 25 | #include <linux/smpboot.h> |
| 26 | #include <linux/sched/rt.h> | ||
| 26 | 27 | ||
| 27 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
| 28 | #include <linux/kvm_para.h> | 29 | #include <linux/kvm_para.h> |
| @@ -112,9 +113,9 @@ static int get_softlockup_thresh(void) | |||
| 112 | * resolution, and we don't need to waste time with a big divide when | 113 | * resolution, and we don't need to waste time with a big divide when |
| 113 | * 2^30ns == 1.074s. | 114 | * 2^30ns == 1.074s. |
| 114 | */ | 115 | */ |
| 115 | static unsigned long get_timestamp(int this_cpu) | 116 | static unsigned long get_timestamp(void) |
| 116 | { | 117 | { |
| 117 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | 118 | return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ |
| 118 | } | 119 | } |
| 119 | 120 | ||
| 120 | static void set_sample_period(void) | 121 | static void set_sample_period(void) |
| @@ -132,9 +133,7 @@ static void set_sample_period(void) | |||
| 132 | /* Commands for resetting the watchdog */ | 133 | /* Commands for resetting the watchdog */ |
| 133 | static void __touch_watchdog(void) | 134 | static void __touch_watchdog(void) |
| 134 | { | 135 | { |
| 135 | int this_cpu = smp_processor_id(); | 136 | __this_cpu_write(watchdog_touch_ts, get_timestamp()); |
| 136 | |||
| 137 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); | ||
| 138 | } | 137 | } |
| 139 | 138 | ||
| 140 | void touch_softlockup_watchdog(void) | 139 | void touch_softlockup_watchdog(void) |
| @@ -195,7 +194,7 @@ static int is_hardlockup(void) | |||
| 195 | 194 | ||
| 196 | static int is_softlockup(unsigned long touch_ts) | 195 | static int is_softlockup(unsigned long touch_ts) |
| 197 | { | 196 | { |
| 198 | unsigned long now = get_timestamp(smp_processor_id()); | 197 | unsigned long now = get_timestamp(); |
| 199 | 198 | ||
| 200 | /* Warn about unreasonable delays: */ | 199 | /* Warn about unreasonable delays: */ |
| 201 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 200 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
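get_timestamp() above keeps the existing trick of converting nanoseconds to approximate seconds with a shift: dividing by 2^30 (about 1.074 s, as the comment notes) instead of 10^9 avoids a 64-bit divide at the cost of the count running roughly 7% low, which is acceptable for softlockup thresholds. The only functional change is reading the clock with local_clock() rather than cpu_clock(this_cpu). The approximation itself is plain arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ns = 60ULL * 1000 * 1000 * 1000;	/* 60 s in ns */

	/* 2^30 ns ~= 1.074 s, so the shifted count is slightly low */
	printf("exact: %llu s, shifted: %llu \"s\"\n",
	       (unsigned long long)(ns / 1000000000ULL),
	       (unsigned long long)(ns >> 30));
	return 0;
}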
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fbc6576a83c3..81f2457811eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -41,32 +41,31 @@ | |||
| 41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
| 42 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
| 43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
| 44 | #include <linux/hashtable.h> | ||
| 44 | 45 | ||
| 45 | #include "workqueue_sched.h" | 46 | #include "workqueue_internal.h" |
| 46 | 47 | ||
| 47 | enum { | 48 | enum { |
| 48 | /* | 49 | /* |
| 49 | * global_cwq flags | 50 | * worker_pool flags |
| 50 | * | 51 | * |
| 51 | * A bound gcwq is either associated or disassociated with its CPU. | 52 | * A bound pool is either associated or disassociated with its CPU. |
| 52 | * While associated (!DISASSOCIATED), all workers are bound to the | 53 | * While associated (!DISASSOCIATED), all workers are bound to the |
| 53 | * CPU and none has %WORKER_UNBOUND set and concurrency management | 54 | * CPU and none has %WORKER_UNBOUND set and concurrency management |
| 54 | * is in effect. | 55 | * is in effect. |
| 55 | * | 56 | * |
| 56 | * While DISASSOCIATED, the cpu may be offline and all workers have | 57 | * While DISASSOCIATED, the cpu may be offline and all workers have |
| 57 | * %WORKER_UNBOUND set and concurrency management disabled, and may | 58 | * %WORKER_UNBOUND set and concurrency management disabled, and may |
| 58 | * be executing on any CPU. The gcwq behaves as an unbound one. | 59 | * be executing on any CPU. The pool behaves as an unbound one. |
| 59 | * | 60 | * |
| 60 | * Note that DISASSOCIATED can be flipped only while holding | 61 | * Note that DISASSOCIATED can be flipped only while holding |
| 61 | * assoc_mutex of all pools on the gcwq to avoid changing binding | 62 | * assoc_mutex to avoid changing binding state while |
| 62 | * state while create_worker() is in progress. | 63 | * create_worker() is in progress. |
| 63 | */ | 64 | */ |
| 64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ | ||
| 65 | GCWQ_FREEZING = 1 << 1, /* freeze in progress */ | ||
| 66 | |||
| 67 | /* pool flags */ | ||
| 68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 65 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ |
| 69 | POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ | 66 | POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ |
| 67 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | ||
| 68 | POOL_FREEZING = 1 << 3, /* freeze in progress */ | ||
| 70 | 69 | ||
| 71 | /* worker flags */ | 70 | /* worker flags */ |
| 72 | WORKER_STARTED = 1 << 0, /* started */ | 71 | WORKER_STARTED = 1 << 0, /* started */ |
| @@ -79,11 +78,9 @@ enum { | |||
| 79 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | | 78 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | |
| 80 | WORKER_CPU_INTENSIVE, | 79 | WORKER_CPU_INTENSIVE, |
| 81 | 80 | ||
| 82 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ | 81 | NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ |
| 83 | 82 | ||
| 84 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | 83 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ |
| 85 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | ||
| 86 | BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, | ||
| 87 | 84 | ||
| 88 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 85 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
| 89 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | 86 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ |
| @@ -111,48 +108,24 @@ enum { | |||
| 111 | * P: Preemption protected. Disabling preemption is enough and should | 108 | * P: Preemption protected. Disabling preemption is enough and should |
| 112 | * only be modified and accessed from the local cpu. | 109 | * only be modified and accessed from the local cpu. |
| 113 | * | 110 | * |
| 114 | * L: gcwq->lock protected. Access with gcwq->lock held. | 111 | * L: pool->lock protected. Access with pool->lock held. |
| 115 | * | 112 | * |
| 116 | * X: During normal operation, modification requires gcwq->lock and | 113 | * X: During normal operation, modification requires pool->lock and should |
| 117 | * should be done only from local cpu. Either disabling preemption | 114 | * be done only from local cpu. Either disabling preemption on local |
| 118 | * on local cpu or grabbing gcwq->lock is enough for read access. | 115 | * cpu or grabbing pool->lock is enough for read access. If |
| 119 | * If GCWQ_DISASSOCIATED is set, it's identical to L. | 116 | * POOL_DISASSOCIATED is set, it's identical to L. |
| 120 | * | 117 | * |
| 121 | * F: wq->flush_mutex protected. | 118 | * F: wq->flush_mutex protected. |
| 122 | * | 119 | * |
| 123 | * W: workqueue_lock protected. | 120 | * W: workqueue_lock protected. |
| 124 | */ | 121 | */ |
| 125 | 122 | ||
| 126 | struct global_cwq; | 123 | /* struct worker is defined in workqueue_internal.h */ |
| 127 | struct worker_pool; | ||
| 128 | |||
| 129 | /* | ||
| 130 | * The poor guys doing the actual heavy lifting. All on-duty workers | ||
| 131 | * are either serving the manager role, on idle list or on busy hash. | ||
| 132 | */ | ||
| 133 | struct worker { | ||
| 134 | /* on idle list while idle, on busy hash table while busy */ | ||
| 135 | union { | ||
| 136 | struct list_head entry; /* L: while idle */ | ||
| 137 | struct hlist_node hentry; /* L: while busy */ | ||
| 138 | }; | ||
| 139 | |||
| 140 | struct work_struct *current_work; /* L: work being processed */ | ||
| 141 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ | ||
| 142 | struct list_head scheduled; /* L: scheduled works */ | ||
| 143 | struct task_struct *task; /* I: worker task */ | ||
| 144 | struct worker_pool *pool; /* I: the associated pool */ | ||
| 145 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
| 146 | unsigned long last_active; /* L: last active timestamp */ | ||
| 147 | unsigned int flags; /* X: flags */ | ||
| 148 | int id; /* I: worker id */ | ||
| 149 | |||
| 150 | /* for rebinding worker to CPU */ | ||
| 151 | struct work_struct rebind_work; /* L: for busy worker */ | ||
| 152 | }; | ||
| 153 | 124 | ||
| 154 | struct worker_pool { | 125 | struct worker_pool { |
| 155 | struct global_cwq *gcwq; /* I: the owning gcwq */ | 126 | spinlock_t lock; /* the pool lock */ |
| 127 | unsigned int cpu; /* I: the associated cpu */ | ||
| 128 | int id; /* I: pool ID */ | ||
| 156 | unsigned int flags; /* X: flags */ | 129 | unsigned int flags; /* X: flags */ |
| 157 | 130 | ||
| 158 | struct list_head worklist; /* L: list of pending works */ | 131 | struct list_head worklist; /* L: list of pending works */ |
| @@ -165,34 +138,28 @@ struct worker_pool { | |||
| 165 | struct timer_list idle_timer; /* L: worker idle timeout */ | 138 | struct timer_list idle_timer; /* L: worker idle timeout */ |
| 166 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | 139 | struct timer_list mayday_timer; /* L: SOS timer for workers */ |
| 167 | 140 | ||
| 168 | struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ | 141 | /* workers are chained either in busy_hash or idle_list */ |
| 169 | struct ida worker_ida; /* L: for worker IDs */ | 142 | DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); |
| 170 | }; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Global per-cpu workqueue. There's one and only one for each cpu | ||
| 174 | * and all works are queued and processed here regardless of their | ||
| 175 | * target workqueues. | ||
| 176 | */ | ||
| 177 | struct global_cwq { | ||
| 178 | spinlock_t lock; /* the gcwq lock */ | ||
| 179 | unsigned int cpu; /* I: the associated cpu */ | ||
| 180 | unsigned int flags; /* L: GCWQ_* flags */ | ||
| 181 | |||
| 182 | /* workers are chained either in busy_hash or pool idle_list */ | ||
| 183 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | ||
| 184 | /* L: hash of busy workers */ | 143 | /* L: hash of busy workers */ |
| 185 | 144 | ||
| 186 | struct worker_pool pools[NR_WORKER_POOLS]; | 145 | struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ |
| 187 | /* normal and highpri pools */ | 146 | struct ida worker_ida; /* L: for worker IDs */ |
| 147 | |||
| 148 | /* | ||
| 149 | * The current concurrency level. As it's likely to be accessed | ||
| 150 | * from other CPUs during try_to_wake_up(), put it in a separate | ||
| 151 | * cacheline. | ||
| 152 | */ | ||
| 153 | atomic_t nr_running ____cacheline_aligned_in_smp; | ||
| 188 | } ____cacheline_aligned_in_smp; | 154 | } ____cacheline_aligned_in_smp; |
| 189 | 155 | ||
| 190 | /* | 156 | /* |
| 191 | * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of | 157 | * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS |
| 192 | * work_struct->data are used for flags and thus cwqs need to be | 158 | * of work_struct->data are used for flags and the remaining high bits |
| 193 | * aligned at two's power of the number of flag bits. | 159 | * point to the pwq; thus, pwqs need to be aligned at two's power of the |
| 160 | * number of flag bits. | ||
| 194 | */ | 161 | */ |
| 195 | struct cpu_workqueue_struct { | 162 | struct pool_workqueue { |
| 196 | struct worker_pool *pool; /* I: the associated pool */ | 163 | struct worker_pool *pool; /* I: the associated pool */ |
| 197 | struct workqueue_struct *wq; /* I: the owning workqueue */ | 164 | struct workqueue_struct *wq; /* I: the owning workqueue */ |
| 198 | int work_color; /* L: current color */ | 165 | int work_color; /* L: current color */ |
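The hunk above moves the old fixed-size busy_hash array into the per-pool worker_pool and switches it to the generic linux/hashtable.h helpers: DECLARE_HASHTABLE() sizes the table from BUSY_WORKER_HASH_ORDER (2^6 = 64 buckets), and the helpers take care of hashing the key and masking the bucket index. A minimal sketch of the same helpers with a hypothetical record type:

#include <linux/hashtable.h>

struct busy_rec {
	void *key;			/* e.g. address of a work item */
	struct hlist_node hentry;
};

/* 2^6 = 64 buckets, mirroring BUSY_WORKER_HASH_ORDER */
static DEFINE_HASHTABLE(busy, 6);

static void busy_add(struct busy_rec *rec, void *key)
{
	rec->key = key;
	/* the helper hashes the key and picks the bucket itself */
	hash_add(busy, &rec->hentry, (unsigned long)key);
}

static void busy_walk(void (*fn)(struct busy_rec *rec))
{
	struct busy_rec *rec;
	int bkt;

	/* full walk over every bucket, as for_each_busy_worker() does */
	hash_for_each(busy, bkt, rec, hentry)
		fn(rec);
}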
| @@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t; | |||
| 241 | struct workqueue_struct { | 208 | struct workqueue_struct { |
| 242 | unsigned int flags; /* W: WQ_* flags */ | 209 | unsigned int flags; /* W: WQ_* flags */ |
| 243 | union { | 210 | union { |
| 244 | struct cpu_workqueue_struct __percpu *pcpu; | 211 | struct pool_workqueue __percpu *pcpu; |
| 245 | struct cpu_workqueue_struct *single; | 212 | struct pool_workqueue *single; |
| 246 | unsigned long v; | 213 | unsigned long v; |
| 247 | } cpu_wq; /* I: cwq's */ | 214 | } pool_wq; /* I: pwq's */ |
| 248 | struct list_head list; /* W: list of all workqueues */ | 215 | struct list_head list; /* W: list of all workqueues */ |
| 249 | 216 | ||
| 250 | struct mutex flush_mutex; /* protects wq flushing */ | 217 | struct mutex flush_mutex; /* protects wq flushing */ |
| 251 | int work_color; /* F: current work color */ | 218 | int work_color; /* F: current work color */ |
| 252 | int flush_color; /* F: current flush color */ | 219 | int flush_color; /* F: current flush color */ |
| 253 | atomic_t nr_cwqs_to_flush; /* flush in progress */ | 220 | atomic_t nr_pwqs_to_flush; /* flush in progress */ |
| 254 | struct wq_flusher *first_flusher; /* F: first flusher */ | 221 | struct wq_flusher *first_flusher; /* F: first flusher */ |
| 255 | struct list_head flusher_queue; /* F: flush waiters */ | 222 | struct list_head flusher_queue; /* F: flush waiters */ |
| 256 | struct list_head flusher_overflow; /* F: flush overflow list */ | 223 | struct list_head flusher_overflow; /* F: flush overflow list */ |
| @@ -259,7 +226,7 @@ struct workqueue_struct { | |||
| 259 | struct worker *rescuer; /* I: rescue worker */ | 226 | struct worker *rescuer; /* I: rescue worker */ |
| 260 | 227 | ||
| 261 | int nr_drainers; /* W: drain in progress */ | 228 | int nr_drainers; /* W: drain in progress */ |
| 262 | int saved_max_active; /* W: saved cwq max_active */ | 229 | int saved_max_active; /* W: saved pwq max_active */ |
| 263 | #ifdef CONFIG_LOCKDEP | 230 | #ifdef CONFIG_LOCKDEP |
| 264 | struct lockdep_map lockdep_map; | 231 | struct lockdep_map lockdep_map; |
| 265 | #endif | 232 | #endif |
| @@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); | |||
| 280 | #define CREATE_TRACE_POINTS | 247 | #define CREATE_TRACE_POINTS |
| 281 | #include <trace/events/workqueue.h> | 248 | #include <trace/events/workqueue.h> |
| 282 | 249 | ||
| 283 | #define for_each_worker_pool(pool, gcwq) \ | 250 | #define for_each_std_worker_pool(pool, cpu) \ |
| 284 | for ((pool) = &(gcwq)->pools[0]; \ | 251 | for ((pool) = &std_worker_pools(cpu)[0]; \ |
| 285 | (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) | 252 | (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) |
| 286 | 253 | ||
| 287 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | 254 | #define for_each_busy_worker(worker, i, pool) \ |
| 288 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | 255 | hash_for_each(pool->busy_hash, i, worker, hentry) |
| 289 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | ||
| 290 | 256 | ||
| 291 | static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, | 257 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, |
| 292 | unsigned int sw) | 258 | unsigned int sw) |
| 293 | { | 259 | { |
| 294 | if (cpu < nr_cpu_ids) { | 260 | if (cpu < nr_cpu_ids) { |
| 295 | if (sw & 1) { | 261 | if (sw & 1) { |
| @@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, | |||
| 300 | if (sw & 2) | 266 | if (sw & 2) |
| 301 | return WORK_CPU_UNBOUND; | 267 | return WORK_CPU_UNBOUND; |
| 302 | } | 268 | } |
| 303 | return WORK_CPU_NONE; | 269 | return WORK_CPU_END; |
| 304 | } | 270 | } |
| 305 | 271 | ||
| 306 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | 272 | static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, |
| 307 | struct workqueue_struct *wq) | 273 | struct workqueue_struct *wq) |
| 308 | { | 274 | { |
| 309 | return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); | 275 | return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); |
| 310 | } | 276 | } |
| 311 | 277 | ||
| 312 | /* | 278 | /* |
| 313 | * CPU iterators | 279 | * CPU iterators |
| 314 | * | 280 | * |
| 315 | * An extra gcwq is defined for an invalid cpu number | 281 | * An extra cpu number is defined using an invalid cpu number |
| 316 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any | 282 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any |
| 317 | * specific CPU. The following iterators are similar to | 283 | * specific CPU. The following iterators are similar to for_each_*_cpu() |
| 318 | * for_each_*_cpu() iterators but also considers the unbound gcwq. | 284 | * iterators but also considers the unbound CPU. |
| 319 | * | 285 | * |
| 320 | * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND | 286 | * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND |
| 321 | * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND | 287 | * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND |
| 322 | * for_each_cwq_cpu() : possible CPUs for bound workqueues, | 288 | * for_each_pwq_cpu() : possible CPUs for bound workqueues, |
| 323 | * WORK_CPU_UNBOUND for unbound workqueues | 289 | * WORK_CPU_UNBOUND for unbound workqueues |
| 324 | */ | 290 | */ |
| 325 | #define for_each_gcwq_cpu(cpu) \ | 291 | #define for_each_wq_cpu(cpu) \ |
| 326 | for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ | 292 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ |
| 327 | (cpu) < WORK_CPU_NONE; \ | 293 | (cpu) < WORK_CPU_END; \ |
| 328 | (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) | 294 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) |
| 329 | 295 | ||
| 330 | #define for_each_online_gcwq_cpu(cpu) \ | 296 | #define for_each_online_wq_cpu(cpu) \ |
| 331 | for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ | 297 | for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ |
| 332 | (cpu) < WORK_CPU_NONE; \ | 298 | (cpu) < WORK_CPU_END; \ |
| 333 | (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) | 299 | (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) |
| 334 | 300 | ||
| 335 | #define for_each_cwq_cpu(cpu, wq) \ | 301 | #define for_each_pwq_cpu(cpu, wq) \ |
| 336 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ | 302 | for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ |
| 337 | (cpu) < WORK_CPU_NONE; \ | 303 | (cpu) < WORK_CPU_END; \ |
| 338 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | 304 | (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) |
| 339 | 305 | ||
| 340 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 306 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
| 341 | 307 | ||
| @@ -459,57 +425,69 @@ static LIST_HEAD(workqueues); | |||
| 459 | static bool workqueue_freezing; /* W: have wqs started freezing? */ | 425 | static bool workqueue_freezing; /* W: have wqs started freezing? */ |
| 460 | 426 | ||
| 461 | /* | 427 | /* |
| 462 | * The almighty global cpu workqueues. nr_running is the only field | 428 | * The CPU and unbound standard worker pools. The unbound ones have |
| 463 | * which is expected to be used frequently by other cpus via | 429 | * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. |
| 464 | * try_to_wake_up(). Put it in a separate cacheline. | ||
| 465 | */ | 430 | */ |
| 466 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); | 431 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], |
| 467 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); | 432 | cpu_std_worker_pools); |
| 433 | static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; | ||
| 468 | 434 | ||
| 469 | /* | 435 | /* idr of all pools */ |
| 470 | * Global cpu workqueue and nr_running counter for unbound gcwq. The | 436 | static DEFINE_MUTEX(worker_pool_idr_mutex); |
| 471 | * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its | 437 | static DEFINE_IDR(worker_pool_idr); |
| 472 | * workers have WORKER_UNBOUND set. | ||
| 473 | */ | ||
| 474 | static struct global_cwq unbound_global_cwq; | ||
| 475 | static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { | ||
| 476 | [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ | ||
| 477 | }; | ||
| 478 | 438 | ||
| 479 | static int worker_thread(void *__worker); | 439 | static int worker_thread(void *__worker); |
| 480 | 440 | ||
| 481 | static int worker_pool_pri(struct worker_pool *pool) | 441 | static struct worker_pool *std_worker_pools(int cpu) |
| 482 | { | 442 | { |
| 483 | return pool - pool->gcwq->pools; | 443 | if (cpu != WORK_CPU_UNBOUND) |
| 444 | return per_cpu(cpu_std_worker_pools, cpu); | ||
| 445 | else | ||
| 446 | return unbound_std_worker_pools; | ||
| 484 | } | 447 | } |
| 485 | 448 | ||
| 486 | static struct global_cwq *get_gcwq(unsigned int cpu) | 449 | static int std_worker_pool_pri(struct worker_pool *pool) |
| 487 | { | 450 | { |
| 488 | if (cpu != WORK_CPU_UNBOUND) | 451 | return pool - std_worker_pools(pool->cpu); |
| 489 | return &per_cpu(global_cwq, cpu); | ||
| 490 | else | ||
| 491 | return &unbound_global_cwq; | ||
| 492 | } | 452 | } |
| 493 | 453 | ||
| 494 | static atomic_t *get_pool_nr_running(struct worker_pool *pool) | 454 | /* allocate ID and assign it to @pool */ |
| 455 | static int worker_pool_assign_id(struct worker_pool *pool) | ||
| 495 | { | 456 | { |
| 496 | int cpu = pool->gcwq->cpu; | 457 | int ret; |
| 497 | int idx = worker_pool_pri(pool); | ||
| 498 | 458 | ||
| 499 | if (cpu != WORK_CPU_UNBOUND) | 459 | mutex_lock(&worker_pool_idr_mutex); |
| 500 | return &per_cpu(pool_nr_running, cpu)[idx]; | 460 | idr_pre_get(&worker_pool_idr, GFP_KERNEL); |
| 501 | else | 461 | ret = idr_get_new(&worker_pool_idr, pool, &pool->id); |
| 502 | return &unbound_pool_nr_running[idx]; | 462 | mutex_unlock(&worker_pool_idr_mutex); |
| 463 | |||
| 464 | return ret; | ||
| 503 | } | 465 | } |
| 504 | 466 | ||
| 505 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | 467 | /* |
| 506 | struct workqueue_struct *wq) | 468 | * Lookup worker_pool by id. The idr currently is built during boot and |
| 469 | * never modified. Don't worry about locking for now. | ||
| 470 | */ | ||
| 471 | static struct worker_pool *worker_pool_by_id(int pool_id) | ||
| 472 | { | ||
| 473 | return idr_find(&worker_pool_idr, pool_id); | ||
| 474 | } | ||
| 475 | |||
| 476 | static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) | ||
| 477 | { | ||
| 478 | struct worker_pool *pools = std_worker_pools(cpu); | ||
| 479 | |||
| 480 | return &pools[highpri]; | ||
| 481 | } | ||
| 482 | |||
| 483 | static struct pool_workqueue *get_pwq(unsigned int cpu, | ||
| 484 | struct workqueue_struct *wq) | ||
| 507 | { | 485 | { |
| 508 | if (!(wq->flags & WQ_UNBOUND)) { | 486 | if (!(wq->flags & WQ_UNBOUND)) { |
| 509 | if (likely(cpu < nr_cpu_ids)) | 487 | if (likely(cpu < nr_cpu_ids)) |
| 510 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); | 488 | return per_cpu_ptr(wq->pool_wq.pcpu, cpu); |
| 511 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | 489 | } else if (likely(cpu == WORK_CPU_UNBOUND)) |
| 512 | return wq->cpu_wq.single; | 490 | return wq->pool_wq.single; |
| 513 | return NULL; | 491 | return NULL; |
| 514 | } | 492 | } |
| 515 | 493 | ||
| @@ -530,19 +508,19 @@ static int work_next_color(int color) | |||
| 530 | } | 508 | } |
| 531 | 509 | ||
| 532 | /* | 510 | /* |
| 533 | * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data | 511 | * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data |
| 534 | * contain the pointer to the queued cwq. Once execution starts, the flag | 512 | * contain the pointer to the queued pwq. Once execution starts, the flag |
| 535 | * is cleared and the high bits contain OFFQ flags and CPU number. | 513 | * is cleared and the high bits contain OFFQ flags and pool ID. |
| 536 | * | 514 | * |
| 537 | * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() | 515 | * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() |
| 538 | * and clear_work_data() can be used to set the cwq, cpu or clear | 516 | * and clear_work_data() can be used to set the pwq, pool or clear |
| 539 | * work->data. These functions should only be called while the work is | 517 | * work->data. These functions should only be called while the work is |
| 540 | * owned - ie. while the PENDING bit is set. | 518 | * owned - ie. while the PENDING bit is set. |
| 541 | * | 519 | * |
| 542 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to | 520 | * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq |
| 543 | * a work. gcwq is available once the work has been queued anywhere after | 521 | * corresponding to a work. Pool is available once the work has been |
| 544 | * initialization until it is sync canceled. cwq is available only while | 522 | * queued anywhere after initialization until it is sync canceled. pwq is |
| 545 | * the work item is queued. | 523 | * available only while the work item is queued. |
| 546 | * | 524 | * |
| 547 | * %WORK_OFFQ_CANCELING is used to mark a work item which is being | 525 | * %WORK_OFFQ_CANCELING is used to mark a work item which is being |
| 548 | * canceled. While being canceled, a work item may have its PENDING set | 526 | * canceled. While being canceled, a work item may have its PENDING set |
| @@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data, | |||
| 556 | atomic_long_set(&work->data, data | flags | work_static(work)); | 534 | atomic_long_set(&work->data, data | flags | work_static(work)); |
| 557 | } | 535 | } |
| 558 | 536 | ||
| 559 | static void set_work_cwq(struct work_struct *work, | 537 | static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, |
| 560 | struct cpu_workqueue_struct *cwq, | ||
| 561 | unsigned long extra_flags) | 538 | unsigned long extra_flags) |
| 562 | { | 539 | { |
| 563 | set_work_data(work, (unsigned long)cwq, | 540 | set_work_data(work, (unsigned long)pwq, |
| 564 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | 541 | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); |
| 565 | } | 542 | } |
| 566 | 543 | ||
| 567 | static void set_work_cpu_and_clear_pending(struct work_struct *work, | 544 | static void set_work_pool_and_keep_pending(struct work_struct *work, |
| 568 | unsigned int cpu) | 545 | int pool_id) |
| 546 | { | ||
| 547 | set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, | ||
| 548 | WORK_STRUCT_PENDING); | ||
| 549 | } | ||
| 550 | |||
| 551 | static void set_work_pool_and_clear_pending(struct work_struct *work, | ||
| 552 | int pool_id) | ||
| 569 | { | 553 | { |
| 570 | /* | 554 | /* |
| 571 | * The following wmb is paired with the implied mb in | 555 | * The following wmb is paired with the implied mb in |
| @@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work, | |||
| 574 | * owner. | 558 | * owner. |
| 575 | */ | 559 | */ |
| 576 | smp_wmb(); | 560 | smp_wmb(); |
| 577 | set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); | 561 | set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); |
| 578 | } | 562 | } |
| 579 | 563 | ||
| 580 | static void clear_work_data(struct work_struct *work) | 564 | static void clear_work_data(struct work_struct *work) |
| 581 | { | 565 | { |
| 582 | smp_wmb(); /* see set_work_cpu_and_clear_pending() */ | 566 | smp_wmb(); /* see set_work_pool_and_clear_pending() */ |
| 583 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | 567 | set_work_data(work, WORK_STRUCT_NO_POOL, 0); |
| 584 | } | 568 | } |
| 585 | 569 | ||
| 586 | static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) | 570 | static struct pool_workqueue *get_work_pwq(struct work_struct *work) |
| 587 | { | 571 | { |
| 588 | unsigned long data = atomic_long_read(&work->data); | 572 | unsigned long data = atomic_long_read(&work->data); |
| 589 | 573 | ||
| 590 | if (data & WORK_STRUCT_CWQ) | 574 | if (data & WORK_STRUCT_PWQ) |
| 591 | return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); | 575 | return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); |
| 592 | else | 576 | else |
| 593 | return NULL; | 577 | return NULL; |
| 594 | } | 578 | } |
| 595 | 579 | ||
| 596 | static struct global_cwq *get_work_gcwq(struct work_struct *work) | 580 | /** |
| 581 | * get_work_pool - return the worker_pool a given work was associated with | ||
| 582 | * @work: the work item of interest | ||
| 583 | * | ||
| 584 | * Return the worker_pool @work was last associated with. %NULL if none. | ||
| 585 | */ | ||
| 586 | static struct worker_pool *get_work_pool(struct work_struct *work) | ||
| 597 | { | 587 | { |
| 598 | unsigned long data = atomic_long_read(&work->data); | 588 | unsigned long data = atomic_long_read(&work->data); |
| 599 | unsigned int cpu; | 589 | struct worker_pool *pool; |
| 590 | int pool_id; | ||
| 600 | 591 | ||
| 601 | if (data & WORK_STRUCT_CWQ) | 592 | if (data & WORK_STRUCT_PWQ) |
| 602 | return ((struct cpu_workqueue_struct *) | 593 | return ((struct pool_workqueue *) |
| 603 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; | 594 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool; |
| 604 | 595 | ||
| 605 | cpu = data >> WORK_OFFQ_CPU_SHIFT; | 596 | pool_id = data >> WORK_OFFQ_POOL_SHIFT; |
| 606 | if (cpu == WORK_CPU_NONE) | 597 | if (pool_id == WORK_OFFQ_POOL_NONE) |
| 607 | return NULL; | 598 | return NULL; |
| 608 | 599 | ||
| 609 | BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); | 600 | pool = worker_pool_by_id(pool_id); |
| 610 | return get_gcwq(cpu); | 601 | WARN_ON_ONCE(!pool); |
| 602 | return pool; | ||
| 603 | } | ||
| 604 | |||
| 605 | /** | ||
| 606 | * get_work_pool_id - return the worker pool ID a given work is associated with | ||
| 607 | * @work: the work item of interest | ||
| 608 | * | ||
| 609 | * Return the worker_pool ID @work was last associated with. | ||
| 610 | * %WORK_OFFQ_POOL_NONE if none. | ||
| 611 | */ | ||
| 612 | static int get_work_pool_id(struct work_struct *work) | ||
| 613 | { | ||
| 614 | unsigned long data = atomic_long_read(&work->data); | ||
| 615 | |||
| 616 | if (data & WORK_STRUCT_PWQ) | ||
| 617 | return ((struct pool_workqueue *) | ||
| 618 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; | ||
| 619 | |||
| 620 | return data >> WORK_OFFQ_POOL_SHIFT; | ||
| 611 | } | 621 | } |
| 612 | 622 | ||
| 613 | static void mark_work_canceling(struct work_struct *work) | 623 | static void mark_work_canceling(struct work_struct *work) |
| 614 | { | 624 | { |
| 615 | struct global_cwq *gcwq = get_work_gcwq(work); | 625 | unsigned long pool_id = get_work_pool_id(work); |
| 616 | unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
| 617 | 626 | ||
| 618 | set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, | 627 | pool_id <<= WORK_OFFQ_POOL_SHIFT; |
| 619 | WORK_STRUCT_PENDING); | 628 | set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); |
| 620 | } | 629 | } |
| 621 | 630 | ||
| 622 | static bool work_is_canceling(struct work_struct *work) | 631 | static bool work_is_canceling(struct work_struct *work) |
| 623 | { | 632 | { |
| 624 | unsigned long data = atomic_long_read(&work->data); | 633 | unsigned long data = atomic_long_read(&work->data); |
| 625 | 634 | ||
| 626 | return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); | 635 | return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); |
| 627 | } | 636 | } |
| 628 | 637 | ||
| 629 | /* | 638 | /* |
| 630 | * Policy functions. These define the policies on how the global worker | 639 | * Policy functions. These define the policies on how the global worker |
| 631 | * pools are managed. Unless noted otherwise, these functions assume that | 640 | * pools are managed. Unless noted otherwise, these functions assume that |
| 632 | * they're being called with gcwq->lock held. | 641 | * they're being called with pool->lock held. |
| 633 | */ | 642 | */ |
| 634 | 643 | ||
| 635 | static bool __need_more_worker(struct worker_pool *pool) | 644 | static bool __need_more_worker(struct worker_pool *pool) |
| 636 | { | 645 | { |
| 637 | return !atomic_read(get_pool_nr_running(pool)); | 646 | return !atomic_read(&pool->nr_running); |
| 638 | } | 647 | } |
| 639 | 648 | ||
| 640 | /* | 649 | /* |
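Taken together, the set_work_pool_and_*() and get_work_pool*() helpers above implement the new off-queue encoding described in the comment block: while queued, work->data holds a pool_workqueue pointer flagged with WORK_STRUCT_PWQ; once execution starts, the high bits instead carry the last pool ID shifted by WORK_OFFQ_POOL_SHIFT plus flags such as WORK_OFFQ_CANCELING, with WORK_OFFQ_POOL_NONE meaning "no pool". A simplified, userspace-style sketch of that encode/decode scheme; the X_* shift and flag values below are made up for illustration, the real constants live in include/linux/workqueue.h:

#include <stdio.h>
#include <stdint.h>

/* illustration-only stand-ins for the WORK_STRUCT_*/WORK_OFFQ_* bits */
#define X_STRUCT_PENDING	(1UL << 0)
#define X_STRUCT_PWQ		(1UL << 1)
#define X_OFFQ_CANCELING	(1UL << 2)
#define X_OFFQ_POOL_SHIFT	5

static unsigned long encode_pool(int pool_id, unsigned long flags)
{
	/* pool ID in the high bits, flag bits in the low bits */
	return ((unsigned long)pool_id << X_OFFQ_POOL_SHIFT) | flags;
}

static int decode_pool(unsigned long data)
{
	if (data & X_STRUCT_PWQ)	/* queued: data is a pwq pointer */
		return -1;
	return data >> X_OFFQ_POOL_SHIFT;
}

int main(void)
{
	unsigned long d = encode_pool(7, X_STRUCT_PENDING | X_OFFQ_CANCELING);

	printf("pool id back: %d, canceling bit: %lu\n",
	       decode_pool(d), d & X_OFFQ_CANCELING);
	return 0;
}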
| @@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool) | |||
| 642 | * running workers. | 651 | * running workers. |
| 643 | * | 652 | * |
| 644 | * Note that, because unbound workers never contribute to nr_running, this | 653 | * Note that, because unbound workers never contribute to nr_running, this |
| 645 | * function will always return %true for unbound gcwq as long as the | 654 | * function will always return %true for unbound pools as long as the |
| 646 | * worklist isn't empty. | 655 | * worklist isn't empty. |
| 647 | */ | 656 | */ |
| 648 | static bool need_more_worker(struct worker_pool *pool) | 657 | static bool need_more_worker(struct worker_pool *pool) |
| @@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool) | |||
| 659 | /* Do I need to keep working? Called from currently running workers. */ | 668 | /* Do I need to keep working? Called from currently running workers. */ |
| 660 | static bool keep_working(struct worker_pool *pool) | 669 | static bool keep_working(struct worker_pool *pool) |
| 661 | { | 670 | { |
| 662 | atomic_t *nr_running = get_pool_nr_running(pool); | 671 | return !list_empty(&pool->worklist) && |
| 663 | 672 | atomic_read(&pool->nr_running) <= 1; | |
| 664 | return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; | ||
| 665 | } | 673 | } |
| 666 | 674 | ||
| 667 | /* Do we need a new worker? Called from manager. */ | 675 | /* Do we need a new worker? Called from manager. */ |
| @@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool) | |||
| 714 | * Wake up the first idle worker of @pool. | 722 | * Wake up the first idle worker of @pool. |
| 715 | * | 723 | * |
| 716 | * CONTEXT: | 724 | * CONTEXT: |
| 717 | * spin_lock_irq(gcwq->lock). | 725 | * spin_lock_irq(pool->lock). |
| 718 | */ | 726 | */ |
| 719 | static void wake_up_worker(struct worker_pool *pool) | 727 | static void wake_up_worker(struct worker_pool *pool) |
| 720 | { | 728 | { |
| @@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
| 740 | struct worker *worker = kthread_data(task); | 748 | struct worker *worker = kthread_data(task); |
| 741 | 749 | ||
| 742 | if (!(worker->flags & WORKER_NOT_RUNNING)) { | 750 | if (!(worker->flags & WORKER_NOT_RUNNING)) { |
| 743 | WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); | 751 | WARN_ON_ONCE(worker->pool->cpu != cpu); |
| 744 | atomic_inc(get_pool_nr_running(worker->pool)); | 752 | atomic_inc(&worker->pool->nr_running); |
| 745 | } | 753 | } |
| 746 | } | 754 | } |
| 747 | 755 | ||
| @@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
| 764 | unsigned int cpu) | 772 | unsigned int cpu) |
| 765 | { | 773 | { |
| 766 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | 774 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; |
| 767 | struct worker_pool *pool = worker->pool; | 775 | struct worker_pool *pool; |
| 768 | atomic_t *nr_running = get_pool_nr_running(pool); | ||
| 769 | 776 | ||
| 777 | /* | ||
| 778 | * Rescuers, which may not have all the fields set up like normal | ||
| 779 | * workers, also reach here, let's not access anything before | ||
| 780 | * checking NOT_RUNNING. | ||
| 781 | */ | ||
| 770 | if (worker->flags & WORKER_NOT_RUNNING) | 782 | if (worker->flags & WORKER_NOT_RUNNING) |
| 771 | return NULL; | 783 | return NULL; |
| 772 | 784 | ||
| 785 | pool = worker->pool; | ||
| 786 | |||
| 773 | /* this can only happen on the local cpu */ | 787 | /* this can only happen on the local cpu */ |
| 774 | BUG_ON(cpu != raw_smp_processor_id()); | 788 | BUG_ON(cpu != raw_smp_processor_id()); |
| 775 | 789 | ||
| @@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
| 781 | * NOT_RUNNING is clear. This means that we're bound to and | 795 | * NOT_RUNNING is clear. This means that we're bound to and |
| 782 | * running on the local cpu w/ rq lock held and preemption | 796 | * running on the local cpu w/ rq lock held and preemption |
| 783 | * disabled, which in turn means that none else could be | 797 | * disabled, which in turn means that none else could be |
| 784 | * manipulating idle_list, so dereferencing idle_list without gcwq | 798 | * manipulating idle_list, so dereferencing idle_list without pool |
| 785 | * lock is safe. | 799 | * lock is safe. |
| 786 | */ | 800 | */ |
| 787 | if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) | 801 | if (atomic_dec_and_test(&pool->nr_running) && |
| 802 | !list_empty(&pool->worklist)) | ||
| 788 | to_wakeup = first_worker(pool); | 803 | to_wakeup = first_worker(pool); |
| 789 | return to_wakeup ? to_wakeup->task : NULL; | 804 | return to_wakeup ? to_wakeup->task : NULL; |
| 790 | } | 805 | } |
| @@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
| 800 | * woken up. | 815 | * woken up. |
| 801 | * | 816 | * |
| 802 | * CONTEXT: | 817 | * CONTEXT: |
| 803 | * spin_lock_irq(gcwq->lock) | 818 | * spin_lock_irq(pool->lock) |
| 804 | */ | 819 | */ |
| 805 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | 820 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, |
| 806 | bool wakeup) | 821 | bool wakeup) |
| @@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, | |||
| 816 | */ | 831 | */ |
| 817 | if ((flags & WORKER_NOT_RUNNING) && | 832 | if ((flags & WORKER_NOT_RUNNING) && |
| 818 | !(worker->flags & WORKER_NOT_RUNNING)) { | 833 | !(worker->flags & WORKER_NOT_RUNNING)) { |
| 819 | atomic_t *nr_running = get_pool_nr_running(pool); | ||
| 820 | |||
| 821 | if (wakeup) { | 834 | if (wakeup) { |
| 822 | if (atomic_dec_and_test(nr_running) && | 835 | if (atomic_dec_and_test(&pool->nr_running) && |
| 823 | !list_empty(&pool->worklist)) | 836 | !list_empty(&pool->worklist)) |
| 824 | wake_up_worker(pool); | 837 | wake_up_worker(pool); |
| 825 | } else | 838 | } else |
| 826 | atomic_dec(nr_running); | 839 | atomic_dec(&pool->nr_running); |
| 827 | } | 840 | } |
| 828 | 841 | ||
| 829 | worker->flags |= flags; | 842 | worker->flags |= flags; |
| @@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, | |||
| 837 | * Clear @flags in @worker->flags and adjust nr_running accordingly. | 850 | * Clear @flags in @worker->flags and adjust nr_running accordingly. |
| 838 | * | 851 | * |
| 839 | * CONTEXT: | 852 | * CONTEXT: |
| 840 | * spin_lock_irq(gcwq->lock) | 853 | * spin_lock_irq(pool->lock) |
| 841 | */ | 854 | */ |
| 842 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | 855 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) |
| 843 | { | 856 | { |
| @@ -855,87 +868,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
| 855 | */ | 868 | */ |
| 856 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 869 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
| 857 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 870 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
| 858 | atomic_inc(get_pool_nr_running(pool)); | 871 | atomic_inc(&pool->nr_running); |
| 859 | } | 872 | } |
| 860 | 873 | ||
| 861 | /** | 874 | /** |
| 862 | * busy_worker_head - return the busy hash head for a work | 875 | * find_worker_executing_work - find worker which is executing a work |
| 863 | * @gcwq: gcwq of interest | 876 | * @pool: pool of interest |
| 864 | * @work: work to be hashed | ||
| 865 | * | ||
| 866 | * Return hash head of @gcwq for @work. | ||
| 867 | * | ||
| 868 | * CONTEXT: | ||
| 869 | * spin_lock_irq(gcwq->lock). | ||
| 870 | * | ||
| 871 | * RETURNS: | ||
| 872 | * Pointer to the hash head. | ||
| 873 | */ | ||
| 874 | static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, | ||
| 875 | struct work_struct *work) | ||
| 876 | { | ||
| 877 | const int base_shift = ilog2(sizeof(struct work_struct)); | ||
| 878 | unsigned long v = (unsigned long)work; | ||
| 879 | |||
| 880 | /* simple shift and fold hash, do we need something better? */ | ||
| 881 | v >>= base_shift; | ||
| 882 | v += v >> BUSY_WORKER_HASH_ORDER; | ||
| 883 | v &= BUSY_WORKER_HASH_MASK; | ||
| 884 | |||
| 885 | return &gcwq->busy_hash[v]; | ||
| 886 | } | ||
| 887 | |||
| 888 | /** | ||
| 889 | * __find_worker_executing_work - find worker which is executing a work | ||
| 890 | * @gcwq: gcwq of interest | ||
| 891 | * @bwh: hash head as returned by busy_worker_head() | ||
| 892 | * @work: work to find worker for | 877 | * @work: work to find worker for |
| 893 | * | 878 | * |
| 894 | * Find a worker which is executing @work on @gcwq. @bwh should be | 879 | * Find a worker which is executing @work on @pool by searching |
| 895 | * the hash head obtained by calling busy_worker_head() with the same | 880 | * @pool->busy_hash which is keyed by the address of @work. For a worker |
| 896 | * work. | 881 | * to match, its current execution should match the address of @work and |
| 882 | * its work function. This is to avoid unwanted dependency between | ||
| 883 | * unrelated work executions through a work item being recycled while still | ||
| 884 | * being executed. | ||
| 885 | * | ||
| 886 | * This is a bit tricky. A work item may be freed once its execution | ||
| 887 | * starts and nothing prevents the freed area from being recycled for | ||
| 888 | * another work item. If the same work item address ends up being reused | ||
| 889 | * before the original execution finishes, workqueue will identify the | ||
| 890 | * recycled work item as currently executing and make it wait until the | ||
| 891 | * current execution finishes, introducing an unwanted dependency. | ||
| 892 | * | ||
| 893 | * This function checks the work item address and work function | ||
| 894 | * to avoid false positives. Note that this isn't complete as one may | ||
| 895 | * construct a work function which can introduce dependency onto itself | ||
| 896 | * through a recycled work item. Well, if somebody wants to shoot oneself | ||
| 897 | * in the foot that badly, there's only so much we can do, and if such | ||
| 898 | * deadlock actually occurs, it should be easy to locate the culprit work | ||
| 899 | * function. | ||
| 897 | * | 900 | * |
| 898 | * CONTEXT: | 901 | * CONTEXT: |
| 899 | * spin_lock_irq(gcwq->lock). | 902 | * spin_lock_irq(pool->lock). |
| 900 | * | 903 | * |
| 901 | * RETURNS: | 904 | * RETURNS: |
| 902 | * Pointer to worker which is executing @work if found, NULL | 905 | * Pointer to worker which is executing @work if found, NULL |
| 903 | * otherwise. | 906 | * otherwise. |
| 904 | */ | 907 | */ |
| 905 | static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, | 908 | static struct worker *find_worker_executing_work(struct worker_pool *pool, |
| 906 | struct hlist_head *bwh, | 909 | struct work_struct *work) |
| 907 | struct work_struct *work) | ||
| 908 | { | 910 | { |
| 909 | struct worker *worker; | 911 | struct worker *worker; |
| 910 | struct hlist_node *tmp; | ||
| 911 | 912 | ||
| 912 | hlist_for_each_entry(worker, tmp, bwh, hentry) | 913 | hash_for_each_possible(pool->busy_hash, worker, hentry, |
| 913 | if (worker->current_work == work) | 914 | (unsigned long)work) |
| 915 | if (worker->current_work == work && | ||
| 916 | worker->current_func == work->func) | ||
| 914 | return worker; | 917 | return worker; |
| 915 | return NULL; | ||
| 916 | } | ||
| 917 | 918 | ||
| 918 | /** | 919 | return NULL; |
| 919 | * find_worker_executing_work - find worker which is executing a work | ||
| 920 | * @gcwq: gcwq of interest | ||
| 921 | * @work: work to find worker for | ||
| 922 | * | ||
| 923 | * Find a worker which is executing @work on @gcwq. This function is | ||
| 924 | * identical to __find_worker_executing_work() except that this | ||
| 925 | * function calculates @bwh itself. | ||
| 926 | * | ||
| 927 | * CONTEXT: | ||
| 928 | * spin_lock_irq(gcwq->lock). | ||
| 929 | * | ||
| 930 | * RETURNS: | ||
| 931 | * Pointer to worker which is executing @work if found, NULL | ||
| 932 | * otherwise. | ||
| 933 | */ | ||
| 934 | static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | ||
| 935 | struct work_struct *work) | ||
| 936 | { | ||
| 937 | return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), | ||
| 938 | work); | ||
| 939 | } | 920 | } |
| 940 | 921 | ||
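To make the recycling problem described above concrete, here is a small userspace sketch of the lookup idea: key a table by the work item's address, but only report a match when the stored function pointer also matches, so a recycled allocation with a different callback is not mistaken for the still-running item. The direct-mapped table and all names (busy_hash, find_busy, ...) are invented for illustration and are much simpler than the kernel's hashtable.h machinery.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void (*work_func_t)(void *);

struct busy_entry {
        const void  *work;      /* address of the queued item */
        work_func_t  func;      /* callback recorded when execution began */
};

#define BUSY_HASH_BITS 4
#define BUSY_HASH_SIZE (1u << BUSY_HASH_BITS)

static struct busy_entry busy_hash[BUSY_HASH_SIZE];

static unsigned int hash_ptr(const void *p)
{
        /* crude shift-and-mask hash, enough for the sketch */
        return ((uintptr_t)p >> 4) & (BUSY_HASH_SIZE - 1);
}

/* Record that @work with callback @func just started executing. */
static void mark_busy(const void *work, work_func_t func)
{
        unsigned int i = hash_ptr(work);

        busy_hash[i].work = work;
        busy_hash[i].func = func;
}

/*
 * Return nonzero iff @work with callback @func is currently executing.
 * Matching on both fields avoids a false positive when the same address
 * has been recycled for an unrelated item with a different callback.
 */
static int find_busy(const void *work, work_func_t func)
{
        unsigned int i = hash_ptr(work);

        return busy_hash[i].work == work && busy_hash[i].func == func;
}

static void fn_a(void *arg) { (void)arg; }
static void fn_b(void *arg) { (void)arg; }

int main(void)
{
        int item;                       /* stands in for a work_struct */

        mark_busy(&item, fn_a);
        printf("same item, same func: %d\n", find_busy(&item, fn_a));  /* 1 */
        printf("recycled, other func: %d\n", find_busy(&item, fn_b));  /* 0 */
        return 0;
}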
| 941 | /** | 922 | /** |
| @@ -953,7 +934,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | |||
| 953 | * nested inside outer list_for_each_entry_safe(). | 934 | * nested inside outer list_for_each_entry_safe(). |
| 954 | * | 935 | * |
| 955 | * CONTEXT: | 936 | * CONTEXT: |
| 956 | * spin_lock_irq(gcwq->lock). | 937 | * spin_lock_irq(pool->lock). |
| 957 | */ | 938 | */ |
| 958 | static void move_linked_works(struct work_struct *work, struct list_head *head, | 939 | static void move_linked_works(struct work_struct *work, struct list_head *head, |
| 959 | struct work_struct **nextp) | 940 | struct work_struct **nextp) |
| @@ -979,67 +960,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, | |||
| 979 | *nextp = n; | 960 | *nextp = n; |
| 980 | } | 961 | } |
| 981 | 962 | ||
| 982 | static void cwq_activate_delayed_work(struct work_struct *work) | 963 | static void pwq_activate_delayed_work(struct work_struct *work) |
| 983 | { | 964 | { |
| 984 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | 965 | struct pool_workqueue *pwq = get_work_pwq(work); |
| 985 | 966 | ||
| 986 | trace_workqueue_activate_work(work); | 967 | trace_workqueue_activate_work(work); |
| 987 | move_linked_works(work, &cwq->pool->worklist, NULL); | 968 | move_linked_works(work, &pwq->pool->worklist, NULL); |
| 988 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | 969 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); |
| 989 | cwq->nr_active++; | 970 | pwq->nr_active++; |
| 990 | } | 971 | } |
| 991 | 972 | ||
| 992 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | 973 | static void pwq_activate_first_delayed(struct pool_workqueue *pwq) |
| 993 | { | 974 | { |
| 994 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | 975 | struct work_struct *work = list_first_entry(&pwq->delayed_works, |
| 995 | struct work_struct, entry); | 976 | struct work_struct, entry); |
| 996 | 977 | ||
| 997 | cwq_activate_delayed_work(work); | 978 | pwq_activate_delayed_work(work); |
| 998 | } | 979 | } |
| 999 | 980 | ||
| 1000 | /** | 981 | /** |
| 1001 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | 982 | * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight |
| 1002 | * @cwq: cwq of interest | 983 | * @pwq: pwq of interest |
| 1003 | * @color: color of work which left the queue | 984 | * @color: color of work which left the queue |
| 1004 | * | 985 | * |
| 1005 | * A work either has completed or is removed from pending queue, | 986 | * A work either has completed or is removed from pending queue, |
| 1006 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | 987 | * decrement nr_in_flight of its pwq and handle workqueue flushing. |
| 1007 | * | 988 | * |
| 1008 | * CONTEXT: | 989 | * CONTEXT: |
| 1009 | * spin_lock_irq(gcwq->lock). | 990 | * spin_lock_irq(pool->lock). |
| 1010 | */ | 991 | */ |
| 1011 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) | 992 | static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) |
| 1012 | { | 993 | { |
| 1013 | /* ignore uncolored works */ | 994 | /* ignore uncolored works */ |
| 1014 | if (color == WORK_NO_COLOR) | 995 | if (color == WORK_NO_COLOR) |
| 1015 | return; | 996 | return; |
| 1016 | 997 | ||
| 1017 | cwq->nr_in_flight[color]--; | 998 | pwq->nr_in_flight[color]--; |
| 1018 | 999 | ||
| 1019 | cwq->nr_active--; | 1000 | pwq->nr_active--; |
| 1020 | if (!list_empty(&cwq->delayed_works)) { | 1001 | if (!list_empty(&pwq->delayed_works)) { |
| 1021 | /* one down, submit a delayed one */ | 1002 | /* one down, submit a delayed one */ |
| 1022 | if (cwq->nr_active < cwq->max_active) | 1003 | if (pwq->nr_active < pwq->max_active) |
| 1023 | cwq_activate_first_delayed(cwq); | 1004 | pwq_activate_first_delayed(pwq); |
| 1024 | } | 1005 | } |
| 1025 | 1006 | ||
| 1026 | /* is flush in progress and are we at the flushing tip? */ | 1007 | /* is flush in progress and are we at the flushing tip? */ |
| 1027 | if (likely(cwq->flush_color != color)) | 1008 | if (likely(pwq->flush_color != color)) |
| 1028 | return; | 1009 | return; |
| 1029 | 1010 | ||
| 1030 | /* are there still in-flight works? */ | 1011 | /* are there still in-flight works? */ |
| 1031 | if (cwq->nr_in_flight[color]) | 1012 | if (pwq->nr_in_flight[color]) |
| 1032 | return; | 1013 | return; |
| 1033 | 1014 | ||
| 1034 | /* this cwq is done, clear flush_color */ | 1015 | /* this pwq is done, clear flush_color */ |
| 1035 | cwq->flush_color = -1; | 1016 | pwq->flush_color = -1; |
| 1036 | 1017 | ||
| 1037 | /* | 1018 | /* |
| 1038 | * If this was the last cwq, wake up the first flusher. It | 1019 | * If this was the last pwq, wake up the first flusher. It |
| 1039 | * will handle the rest. | 1020 | * will handle the rest. |
| 1040 | */ | 1021 | */ |
| 1041 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | 1022 | if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) |
| 1042 | complete(&cwq->wq->first_flusher->done); | 1023 | complete(&pwq->wq->first_flusher->done); |
| 1043 | } | 1024 | } |
| 1044 | 1025 | ||
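pwq_dec_nr_in_flight() above does two jobs: refill nr_active from the delayed list, and complete a flush once the last in-flight work of the flush color drains. A compressed userspace sketch of the color bookkeeping follows; NR_COLORS, the fake_pwq struct and the flush_completed flag are illustrative stand-ins, not the kernel's definitions.

#include <stdbool.h>
#include <stdio.h>

#define NR_COLORS 16
#define NO_COLOR  -1

struct fake_pwq {
        int  nr_in_flight[NR_COLORS];
        int  flush_color;       /* color being flushed, or -1 */
        bool flush_completed;
};

/* Called whenever a work of @color finishes or is cancelled. */
static void dec_nr_in_flight(struct fake_pwq *pwq, int color)
{
        if (color == NO_COLOR)          /* internal, uncolored items */
                return;

        pwq->nr_in_flight[color]--;

        /* not the color being flushed: nothing more to do */
        if (pwq->flush_color != color)
                return;

        /* still in-flight works of that color: keep waiting */
        if (pwq->nr_in_flight[color])
                return;

        /* last one drained: this pwq's part of the flush is done */
        pwq->flush_color = -1;
        pwq->flush_completed = true;
}

int main(void)
{
        struct fake_pwq pwq = { .flush_color = 3 };

        pwq.nr_in_flight[3] = 2;
        dec_nr_in_flight(&pwq, 3);
        dec_nr_in_flight(&pwq, 3);
        printf("flush completed: %d\n", pwq.flush_completed);   /* 1 */
        return 0;
}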
| 1045 | /** | 1026 | /** |
| @@ -1070,7 +1051,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) | |||
| 1070 | static int try_to_grab_pending(struct work_struct *work, bool is_dwork, | 1051 | static int try_to_grab_pending(struct work_struct *work, bool is_dwork, |
| 1071 | unsigned long *flags) | 1052 | unsigned long *flags) |
| 1072 | { | 1053 | { |
| 1073 | struct global_cwq *gcwq; | 1054 | struct worker_pool *pool; |
| 1055 | struct pool_workqueue *pwq; | ||
| 1074 | 1056 | ||
| 1075 | local_irq_save(*flags); | 1057 | local_irq_save(*flags); |
| 1076 | 1058 | ||
| @@ -1095,41 +1077,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, | |||
| 1095 | * The queueing is in progress, or it is already queued. Try to | 1077 | * The queueing is in progress, or it is already queued. Try to |
| 1096 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | 1078 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. |
| 1097 | */ | 1079 | */ |
| 1098 | gcwq = get_work_gcwq(work); | 1080 | pool = get_work_pool(work); |
| 1099 | if (!gcwq) | 1081 | if (!pool) |
| 1100 | goto fail; | 1082 | goto fail; |
| 1101 | 1083 | ||
| 1102 | spin_lock(&gcwq->lock); | 1084 | spin_lock(&pool->lock); |
| 1103 | if (!list_empty(&work->entry)) { | 1085 | /* |
| 1086 | * work->data is guaranteed to point to pwq only while the work | ||
| 1087 | * item is queued on pwq->wq, and both updating work->data to point | ||
| 1088 | * to pwq on queueing and to pool on dequeueing are done under | ||
| 1089 | * pwq->pool->lock. This in turn guarantees that, if work->data | ||
| 1090 | * points to pwq which is associated with a locked pool, the work | ||
| 1091 | * item is currently queued on that pool. | ||
| 1092 | */ | ||
| 1093 | pwq = get_work_pwq(work); | ||
| 1094 | if (pwq && pwq->pool == pool) { | ||
| 1095 | debug_work_deactivate(work); | ||
| 1096 | |||
| 1104 | /* | 1097 | /* |
| 1105 | * This work is queued, but perhaps we locked the wrong gcwq. | 1098 | * A delayed work item cannot be grabbed directly because |
| 1106 | * In that case we must see the new value after rmb(), see | 1099 | * it might have linked NO_COLOR work items which, if left |
| 1107 | * insert_work()->wmb(). | 1100 | * on the delayed_list, will confuse pwq->nr_active |
| 1101 | * management later on and cause stall. Make sure the work | ||
| 1102 | * item is activated before grabbing. | ||
| 1108 | */ | 1103 | */ |
| 1109 | smp_rmb(); | 1104 | if (*work_data_bits(work) & WORK_STRUCT_DELAYED) |
| 1110 | if (gcwq == get_work_gcwq(work)) { | 1105 | pwq_activate_delayed_work(work); |
| 1111 | debug_work_deactivate(work); | ||
| 1112 | 1106 | ||
| 1113 | /* | 1107 | list_del_init(&work->entry); |
| 1114 | * A delayed work item cannot be grabbed directly | 1108 | pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); |
| 1115 | * because it might have linked NO_COLOR work items | ||
| 1116 | * which, if left on the delayed_list, will confuse | ||
| 1117 | * cwq->nr_active management later on and cause | ||
| 1118 | * stall. Make sure the work item is activated | ||
| 1119 | * before grabbing. | ||
| 1120 | */ | ||
| 1121 | if (*work_data_bits(work) & WORK_STRUCT_DELAYED) | ||
| 1122 | cwq_activate_delayed_work(work); | ||
| 1123 | 1109 | ||
| 1124 | list_del_init(&work->entry); | 1110 | /* work->data points to pwq iff queued, point to pool */ |
| 1125 | cwq_dec_nr_in_flight(get_work_cwq(work), | 1111 | set_work_pool_and_keep_pending(work, pool->id); |
| 1126 | get_work_color(work)); | ||
| 1127 | 1112 | ||
| 1128 | spin_unlock(&gcwq->lock); | 1113 | spin_unlock(&pool->lock); |
| 1129 | return 1; | 1114 | return 1; |
| 1130 | } | ||
| 1131 | } | 1115 | } |
| 1132 | spin_unlock(&gcwq->lock); | 1116 | spin_unlock(&pool->lock); |
| 1133 | fail: | 1117 | fail: |
| 1134 | local_irq_restore(*flags); | 1118 | local_irq_restore(*flags); |
| 1135 | if (work_is_canceling(work)) | 1119 | if (work_is_canceling(work)) |
| @@ -1139,33 +1123,25 @@ fail: | |||
| 1139 | } | 1123 | } |
| 1140 | 1124 | ||
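The return-value convention of try_to_grab_pending() (1 = stolen off the queue, 0 = claimed while idle, -EAGAIN = retry) is easiest to see in a stripped-down form. The sketch below assumes a single pending flag and a single queued state protected by a mutex; it is a userspace analogy, not the kernel implementation, and ignores the delayed/NO_COLOR subtleties handled in the code above.

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative item: a pending flag plus queued state under a lock. */
struct item {
        atomic_bool     pending;        /* set when queueing is claimed */
        bool            queued;         /* true while sitting on the list */
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Try to take ownership of @it away from the queueing machinery.
 * Returns 0 if it was idle and we claimed the pending flag,
 *         1 if it was queued and we stole it off the queue,
 *   -EAGAIN if someone is concurrently queueing it (caller retries).
 */
static int try_to_grab(struct item *it)
{
        bool was_pending = atomic_exchange(&it->pending, true);

        if (!was_pending)
                return 0;       /* idle: the pending flag is now ours */

        /* pending already set: either queued or being queued right now */
        pthread_mutex_lock(&queue_lock);
        if (it->queued) {
                it->queued = false;     /* steal it off the queue */
                pthread_mutex_unlock(&queue_lock);
                return 1;
        }
        pthread_mutex_unlock(&queue_lock);

        return -EAGAIN;         /* queueing in flight elsewhere, retry */
}

int main(void)
{
        struct item it = { .pending = false, .queued = false };

        printf("idle grab:   %d\n", try_to_grab(&it));  /* 0 */
        it.queued = true;                               /* pretend queued */
        printf("queued grab: %d\n", try_to_grab(&it));  /* 1 */
        return 0;
}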
| 1141 | /** | 1125 | /** |
| 1142 | * insert_work - insert a work into gcwq | 1126 | * insert_work - insert a work into a pool |
| 1143 | * @cwq: cwq @work belongs to | 1127 | * @pwq: pwq @work belongs to |
| 1144 | * @work: work to insert | 1128 | * @work: work to insert |
| 1145 | * @head: insertion point | 1129 | * @head: insertion point |
| 1146 | * @extra_flags: extra WORK_STRUCT_* flags to set | 1130 | * @extra_flags: extra WORK_STRUCT_* flags to set |
| 1147 | * | 1131 | * |
| 1148 | * Insert @work which belongs to @cwq into @gcwq after @head. | 1132 | * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to |
| 1149 | * @extra_flags is or'd to work_struct flags. | 1133 | * work_struct flags. |
| 1150 | * | 1134 | * |
| 1151 | * CONTEXT: | 1135 | * CONTEXT: |
| 1152 | * spin_lock_irq(gcwq->lock). | 1136 | * spin_lock_irq(pool->lock). |
| 1153 | */ | 1137 | */ |
| 1154 | static void insert_work(struct cpu_workqueue_struct *cwq, | 1138 | static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, |
| 1155 | struct work_struct *work, struct list_head *head, | 1139 | struct list_head *head, unsigned int extra_flags) |
| 1156 | unsigned int extra_flags) | ||
| 1157 | { | 1140 | { |
| 1158 | struct worker_pool *pool = cwq->pool; | 1141 | struct worker_pool *pool = pwq->pool; |
| 1159 | 1142 | ||
| 1160 | /* we own @work, set data and link */ | 1143 | /* we own @work, set data and link */ |
| 1161 | set_work_cwq(work, cwq, extra_flags); | 1144 | set_work_pwq(work, pwq, extra_flags); |
| 1162 | |||
| 1163 | /* | ||
| 1164 | * Ensure that we get the right work->data if we see the | ||
| 1165 | * result of list_add() below, see try_to_grab_pending(). | ||
| 1166 | */ | ||
| 1167 | smp_wmb(); | ||
| 1168 | |||
| 1169 | list_add_tail(&work->entry, head); | 1145 | list_add_tail(&work->entry, head); |
| 1170 | 1146 | ||
| 1171 | /* | 1147 | /* |
| @@ -1181,41 +1157,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
| 1181 | 1157 | ||
| 1182 | /* | 1158 | /* |
| 1183 | * Test whether @work is being queued from another work executing on the | 1159 | * Test whether @work is being queued from another work executing on the |
| 1184 | * same workqueue. This is rather expensive and should only be used from | 1160 | * same workqueue. |
| 1185 | * cold paths. | ||
| 1186 | */ | 1161 | */ |
| 1187 | static bool is_chained_work(struct workqueue_struct *wq) | 1162 | static bool is_chained_work(struct workqueue_struct *wq) |
| 1188 | { | 1163 | { |
| 1189 | unsigned long flags; | 1164 | struct worker *worker; |
| 1190 | unsigned int cpu; | ||
| 1191 | |||
| 1192 | for_each_gcwq_cpu(cpu) { | ||
| 1193 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 1194 | struct worker *worker; | ||
| 1195 | struct hlist_node *pos; | ||
| 1196 | int i; | ||
| 1197 | 1165 | ||
| 1198 | spin_lock_irqsave(&gcwq->lock, flags); | 1166 | worker = current_wq_worker(); |
| 1199 | for_each_busy_worker(worker, i, pos, gcwq) { | 1167 | /* |
| 1200 | if (worker->task != current) | 1168 | * Return %true iff I'm a worker executing a work item on @wq. If |
| 1201 | continue; | 1169 | * I'm @worker, it's safe to dereference it without locking. |
| 1202 | spin_unlock_irqrestore(&gcwq->lock, flags); | 1170 | */ |
| 1203 | /* | 1171 | return worker && worker->current_pwq->wq == wq; |
| 1204 | * I'm @worker, no locking necessary. See if @work | ||
| 1205 | * is headed to the same workqueue. | ||
| 1206 | */ | ||
| 1207 | return worker->current_cwq->wq == wq; | ||
| 1208 | } | ||
| 1209 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
| 1210 | } | ||
| 1211 | return false; | ||
| 1212 | } | 1172 | } |
| 1213 | 1173 | ||
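The simplification above works because a worker can always inspect its own state without locking: is_chained_work() only has to ask "is the current thread one of my workers, and is its current item on this workqueue?". A userspace analogy using a thread-local pointer is sketched below; the fake_wq/fake_worker types and names are invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_wq { const char *name; };

struct fake_worker {
        struct fake_wq *current_wq;     /* wq of the item being executed */
};

/* Each worker thread publishes itself here; other threads see NULL. */
static _Thread_local struct fake_worker *current_worker;

/* Am I a worker currently executing an item on @wq? */
static bool is_chained_work(struct fake_wq *wq)
{
        /* no locking needed: we only look at our own thread's state */
        return current_worker && current_worker->current_wq == wq;
}

static void *worker_fn(void *arg)
{
        struct fake_worker me;

        me.current_wq = arg;
        current_worker = &me;           /* "I am a worker on this wq" */

        printf("inside worker:  chained on its wq? %d\n",
               is_chained_work(arg));   /* 1 */
        return NULL;
}

int main(void)
{
        struct fake_wq wq = { "example" };
        pthread_t t;

        printf("outside worker: chained on wq?     %d\n",
               is_chained_work(&wq));   /* 0 */

        pthread_create(&t, NULL, worker_fn, &wq);
        pthread_join(t, NULL);
        return 0;
}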
| 1214 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 1174 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
| 1215 | struct work_struct *work) | 1175 | struct work_struct *work) |
| 1216 | { | 1176 | { |
| 1217 | struct global_cwq *gcwq; | 1177 | struct pool_workqueue *pwq; |
| 1218 | struct cpu_workqueue_struct *cwq; | ||
| 1219 | struct list_head *worklist; | 1178 | struct list_head *worklist; |
| 1220 | unsigned int work_flags; | 1179 | unsigned int work_flags; |
| 1221 | unsigned int req_cpu = cpu; | 1180 | unsigned int req_cpu = cpu; |
| @@ -1235,9 +1194,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 1235 | WARN_ON_ONCE(!is_chained_work(wq))) | 1194 | WARN_ON_ONCE(!is_chained_work(wq))) |
| 1236 | return; | 1195 | return; |
| 1237 | 1196 | ||
| 1238 | /* determine gcwq to use */ | 1197 | /* determine the pwq to use */ |
| 1239 | if (!(wq->flags & WQ_UNBOUND)) { | 1198 | if (!(wq->flags & WQ_UNBOUND)) { |
| 1240 | struct global_cwq *last_gcwq; | 1199 | struct worker_pool *last_pool; |
| 1241 | 1200 | ||
| 1242 | if (cpu == WORK_CPU_UNBOUND) | 1201 | if (cpu == WORK_CPU_UNBOUND) |
| 1243 | cpu = raw_smp_processor_id(); | 1202 | cpu = raw_smp_processor_id(); |
| @@ -1248,55 +1207,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 1248 | * work needs to be queued on that cpu to guarantee | 1207 | * work needs to be queued on that cpu to guarantee |
| 1249 | * non-reentrancy. | 1208 | * non-reentrancy. |
| 1250 | */ | 1209 | */ |
| 1251 | gcwq = get_gcwq(cpu); | 1210 | pwq = get_pwq(cpu, wq); |
| 1252 | last_gcwq = get_work_gcwq(work); | 1211 | last_pool = get_work_pool(work); |
| 1253 | 1212 | ||
| 1254 | if (last_gcwq && last_gcwq != gcwq) { | 1213 | if (last_pool && last_pool != pwq->pool) { |
| 1255 | struct worker *worker; | 1214 | struct worker *worker; |
| 1256 | 1215 | ||
| 1257 | spin_lock(&last_gcwq->lock); | 1216 | spin_lock(&last_pool->lock); |
| 1258 | 1217 | ||
| 1259 | worker = find_worker_executing_work(last_gcwq, work); | 1218 | worker = find_worker_executing_work(last_pool, work); |
| 1260 | 1219 | ||
| 1261 | if (worker && worker->current_cwq->wq == wq) | 1220 | if (worker && worker->current_pwq->wq == wq) { |
| 1262 | gcwq = last_gcwq; | 1221 | pwq = get_pwq(last_pool->cpu, wq); |
| 1263 | else { | 1222 | } else { |
| 1264 | /* meh... not running there, queue here */ | 1223 | /* meh... not running there, queue here */ |
| 1265 | spin_unlock(&last_gcwq->lock); | 1224 | spin_unlock(&last_pool->lock); |
| 1266 | spin_lock(&gcwq->lock); | 1225 | spin_lock(&pwq->pool->lock); |
| 1267 | } | 1226 | } |
| 1268 | } else { | 1227 | } else { |
| 1269 | spin_lock(&gcwq->lock); | 1228 | spin_lock(&pwq->pool->lock); |
| 1270 | } | 1229 | } |
| 1271 | } else { | 1230 | } else { |
| 1272 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | 1231 | pwq = get_pwq(WORK_CPU_UNBOUND, wq); |
| 1273 | spin_lock(&gcwq->lock); | 1232 | spin_lock(&pwq->pool->lock); |
| 1274 | } | 1233 | } |
| 1275 | 1234 | ||
| 1276 | /* gcwq determined, get cwq and queue */ | 1235 | /* pwq determined, queue */ |
| 1277 | cwq = get_cwq(gcwq->cpu, wq); | 1236 | trace_workqueue_queue_work(req_cpu, pwq, work); |
| 1278 | trace_workqueue_queue_work(req_cpu, cwq, work); | ||
| 1279 | 1237 | ||
| 1280 | if (WARN_ON(!list_empty(&work->entry))) { | 1238 | if (WARN_ON(!list_empty(&work->entry))) { |
| 1281 | spin_unlock(&gcwq->lock); | 1239 | spin_unlock(&pwq->pool->lock); |
| 1282 | return; | 1240 | return; |
| 1283 | } | 1241 | } |
| 1284 | 1242 | ||
| 1285 | cwq->nr_in_flight[cwq->work_color]++; | 1243 | pwq->nr_in_flight[pwq->work_color]++; |
| 1286 | work_flags = work_color_to_flags(cwq->work_color); | 1244 | work_flags = work_color_to_flags(pwq->work_color); |
| 1287 | 1245 | ||
| 1288 | if (likely(cwq->nr_active < cwq->max_active)) { | 1246 | if (likely(pwq->nr_active < pwq->max_active)) { |
| 1289 | trace_workqueue_activate_work(work); | 1247 | trace_workqueue_activate_work(work); |
| 1290 | cwq->nr_active++; | 1248 | pwq->nr_active++; |
| 1291 | worklist = &cwq->pool->worklist; | 1249 | worklist = &pwq->pool->worklist; |
| 1292 | } else { | 1250 | } else { |
| 1293 | work_flags |= WORK_STRUCT_DELAYED; | 1251 | work_flags |= WORK_STRUCT_DELAYED; |
| 1294 | worklist = &cwq->delayed_works; | 1252 | worklist = &pwq->delayed_works; |
| 1295 | } | 1253 | } |
| 1296 | 1254 | ||
| 1297 | insert_work(cwq, work, worklist, work_flags); | 1255 | insert_work(pwq, work, worklist, work_flags); |
| 1298 | 1256 | ||
| 1299 | spin_unlock(&gcwq->lock); | 1257 | spin_unlock(&pwq->pool->lock); |
| 1300 | } | 1258 | } |
| 1301 | 1259 | ||
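The tail of __queue_work() is where per-workqueue concurrency limiting happens: if the pwq still has active slots the item goes straight to the pool's worklist, otherwise it is parked on the pwq's delayed list with the DELAYED flag set, to be activated later by pwq_dec_nr_in_flight(). A minimal sketch of that branch, with invented counters standing in for the real lists, is:

#include <stdio.h>

#define FLAG_DELAYED 0x1

struct fake_pwq {
        int nr_active;
        int max_active;
        int n_worklist;         /* stand-in for pool->worklist length */
        int n_delayed;          /* stand-in for pwq->delayed_works length */
};

/* Decide where a newly queued item goes and which extra flags it gets. */
static unsigned int queue_item(struct fake_pwq *pwq)
{
        unsigned int flags = 0;

        if (pwq->nr_active < pwq->max_active) {
                pwq->nr_active++;
                pwq->n_worklist++;      /* immediately runnable */
        } else {
                flags |= FLAG_DELAYED;
                pwq->n_delayed++;       /* waits for an active slot */
        }
        return flags;
}

int main(void)
{
        struct fake_pwq pwq = { .nr_active = 0, .max_active = 1 };

        printf("first item delayed?  %d\n",
               !!(queue_item(&pwq) & FLAG_DELAYED));    /* 0 */
        printf("second item delayed? %d\n",
               !!(queue_item(&pwq) & FLAG_DELAYED));    /* 1 */
        return 0;
}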
| 1302 | /** | 1260 | /** |
| @@ -1347,19 +1305,17 @@ EXPORT_SYMBOL_GPL(queue_work); | |||
| 1347 | void delayed_work_timer_fn(unsigned long __data) | 1305 | void delayed_work_timer_fn(unsigned long __data) |
| 1348 | { | 1306 | { |
| 1349 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1307 | struct delayed_work *dwork = (struct delayed_work *)__data; |
| 1350 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); | ||
| 1351 | 1308 | ||
| 1352 | /* should have been called from irqsafe timer with irq already off */ | 1309 | /* should have been called from irqsafe timer with irq already off */ |
| 1353 | __queue_work(dwork->cpu, cwq->wq, &dwork->work); | 1310 | __queue_work(dwork->cpu, dwork->wq, &dwork->work); |
| 1354 | } | 1311 | } |
| 1355 | EXPORT_SYMBOL_GPL(delayed_work_timer_fn); | 1312 | EXPORT_SYMBOL(delayed_work_timer_fn); |
| 1356 | 1313 | ||
| 1357 | static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | 1314 | static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, |
| 1358 | struct delayed_work *dwork, unsigned long delay) | 1315 | struct delayed_work *dwork, unsigned long delay) |
| 1359 | { | 1316 | { |
| 1360 | struct timer_list *timer = &dwork->timer; | 1317 | struct timer_list *timer = &dwork->timer; |
| 1361 | struct work_struct *work = &dwork->work; | 1318 | struct work_struct *work = &dwork->work; |
| 1362 | unsigned int lcpu; | ||
| 1363 | 1319 | ||
| 1364 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || | 1320 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || |
| 1365 | timer->data != (unsigned long)dwork); | 1321 | timer->data != (unsigned long)dwork); |
| @@ -1379,30 +1335,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | |||
| 1379 | 1335 | ||
| 1380 | timer_stats_timer_set_start_info(&dwork->timer); | 1336 | timer_stats_timer_set_start_info(&dwork->timer); |
| 1381 | 1337 | ||
| 1382 | /* | 1338 | dwork->wq = wq; |
| 1383 | * This stores cwq for the moment, for the timer_fn. Note that the | ||
| 1384 | * work's gcwq is preserved to allow reentrance detection for | ||
| 1385 | * delayed works. | ||
| 1386 | */ | ||
| 1387 | if (!(wq->flags & WQ_UNBOUND)) { | ||
| 1388 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 1389 | |||
| 1390 | /* | ||
| 1391 | * If we cannot get the last gcwq from @work directly, | ||
| 1392 | * select the last CPU such that it avoids unnecessarily | ||
| 1393 | * triggering non-reentrancy check in __queue_work(). | ||
| 1394 | */ | ||
| 1395 | lcpu = cpu; | ||
| 1396 | if (gcwq) | ||
| 1397 | lcpu = gcwq->cpu; | ||
| 1398 | if (lcpu == WORK_CPU_UNBOUND) | ||
| 1399 | lcpu = raw_smp_processor_id(); | ||
| 1400 | } else { | ||
| 1401 | lcpu = WORK_CPU_UNBOUND; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
| 1405 | |||
| 1406 | dwork->cpu = cpu; | 1339 | dwork->cpu = cpu; |
| 1407 | timer->expires = jiffies + delay; | 1340 | timer->expires = jiffies + delay; |
| 1408 | 1341 | ||
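The simplification in __queue_delayed_work() is that the target workqueue is now remembered in the delayed_work itself (dwork->wq) instead of being encoded into work->data as a cwq pointer, so delayed_work_timer_fn() can queue the work directly without guessing a CPU up front. A small sketch of that shape, with a plain function call standing in for a real timer and invented types throughout, is:

#include <stdio.h>

struct fake_wq { const char *name; };

struct fake_work { const char *what; };

/* The delayed work remembers everything its timer callback will need. */
struct fake_delayed_work {
        struct fake_work work;
        struct fake_wq  *wq;    /* target workqueue, set at queue time */
        int              cpu;   /* requested CPU, or -1 for "don't care" */
};

static void queue_work_on(int cpu, struct fake_wq *wq, struct fake_work *w)
{
        printf("queueing '%s' on wq '%s' (cpu %d)\n", w->what, wq->name, cpu);
}

/* What the timer callback does when the delay expires. */
static void delayed_work_timer_fn(struct fake_delayed_work *dwork)
{
        queue_work_on(dwork->cpu, dwork->wq, &dwork->work);
}

static void queue_delayed_work(int cpu, struct fake_wq *wq,
                               struct fake_delayed_work *dwork)
{
        dwork->wq = wq;         /* record target for the timer callback */
        dwork->cpu = cpu;
        delayed_work_timer_fn(dwork);   /* pretend the timer just fired */
}

int main(void)
{
        struct fake_wq wq = { "events" };
        struct fake_delayed_work dw = { .work = { "example item" } };

        queue_delayed_work(-1, &wq, &dw);
        return 0;
}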
| @@ -1519,12 +1452,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work); | |||
| 1519 | * necessary. | 1452 | * necessary. |
| 1520 | * | 1453 | * |
| 1521 | * LOCKING: | 1454 | * LOCKING: |
| 1522 | * spin_lock_irq(gcwq->lock). | 1455 | * spin_lock_irq(pool->lock). |
| 1523 | */ | 1456 | */ |
| 1524 | static void worker_enter_idle(struct worker *worker) | 1457 | static void worker_enter_idle(struct worker *worker) |
| 1525 | { | 1458 | { |
| 1526 | struct worker_pool *pool = worker->pool; | 1459 | struct worker_pool *pool = worker->pool; |
| 1527 | struct global_cwq *gcwq = pool->gcwq; | ||
| 1528 | 1460 | ||
| 1529 | BUG_ON(worker->flags & WORKER_IDLE); | 1461 | BUG_ON(worker->flags & WORKER_IDLE); |
| 1530 | BUG_ON(!list_empty(&worker->entry) && | 1462 | BUG_ON(!list_empty(&worker->entry) && |
| @@ -1542,14 +1474,14 @@ static void worker_enter_idle(struct worker *worker) | |||
| 1542 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); | 1474 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); |
| 1543 | 1475 | ||
| 1544 | /* | 1476 | /* |
| 1545 | * Sanity check nr_running. Because gcwq_unbind_fn() releases | 1477 | * Sanity check nr_running. Because wq_unbind_fn() releases |
| 1546 | * gcwq->lock between setting %WORKER_UNBOUND and zapping | 1478 | * pool->lock between setting %WORKER_UNBOUND and zapping |
| 1547 | * nr_running, the warning may trigger spuriously. Check iff | 1479 | * nr_running, the warning may trigger spuriously. Check iff |
| 1548 | * unbind is not in progress. | 1480 | * unbind is not in progress. |
| 1549 | */ | 1481 | */ |
| 1550 | WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && | 1482 | WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && |
| 1551 | pool->nr_workers == pool->nr_idle && | 1483 | pool->nr_workers == pool->nr_idle && |
| 1552 | atomic_read(get_pool_nr_running(pool))); | 1484 | atomic_read(&pool->nr_running)); |
| 1553 | } | 1485 | } |
| 1554 | 1486 | ||
| 1555 | /** | 1487 | /** |
| @@ -1559,7 +1491,7 @@ static void worker_enter_idle(struct worker *worker) | |||
| 1559 | * @worker is leaving idle state. Update stats. | 1491 | * @worker is leaving idle state. Update stats. |
| 1560 | * | 1492 | * |
| 1561 | * LOCKING: | 1493 | * LOCKING: |
| 1562 | * spin_lock_irq(gcwq->lock). | 1494 | * spin_lock_irq(pool->lock). |
| 1563 | */ | 1495 | */ |
| 1564 | static void worker_leave_idle(struct worker *worker) | 1496 | static void worker_leave_idle(struct worker *worker) |
| 1565 | { | 1497 | { |
| @@ -1572,7 +1504,7 @@ static void worker_leave_idle(struct worker *worker) | |||
| 1572 | } | 1504 | } |
| 1573 | 1505 | ||
| 1574 | /** | 1506 | /** |
| 1575 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq | 1507 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool |
| 1576 | * @worker: self | 1508 | * @worker: self |
| 1577 | * | 1509 | * |
| 1578 | * Works which are scheduled while the cpu is online must at least be | 1510 | * Works which are scheduled while the cpu is online must at least be |
| @@ -1584,27 +1516,27 @@ static void worker_leave_idle(struct worker *worker) | |||
| 1584 | * themselves to the target cpu and may race with cpu going down or | 1516 | * themselves to the target cpu and may race with cpu going down or |
| 1585 | * coming online. kthread_bind() can't be used because it may put the | 1517 | * coming online. kthread_bind() can't be used because it may put the |
| 1586 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | 1518 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used |
| 1587 | * verbatim as it's best effort and blocking and gcwq may be | 1519 | * verbatim as it's best effort and blocking and pool may be |
| 1588 | * [dis]associated in the meantime. | 1520 | * [dis]associated in the meantime. |
| 1589 | * | 1521 | * |
| 1590 | * This function tries set_cpus_allowed() and locks gcwq and verifies the | 1522 | * This function tries set_cpus_allowed() and locks pool and verifies the |
| 1591 | * binding against %GCWQ_DISASSOCIATED which is set during | 1523 | * binding against %POOL_DISASSOCIATED which is set during |
| 1592 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker | 1524 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker |
| 1593 | * enters idle state or fetches works without dropping lock, it can | 1525 | * enters idle state or fetches works without dropping lock, it can |
| 1594 | * guarantee the scheduling requirement described in the first paragraph. | 1526 | * guarantee the scheduling requirement described in the first paragraph. |
| 1595 | * | 1527 | * |
| 1596 | * CONTEXT: | 1528 | * CONTEXT: |
| 1597 | * Might sleep. Called without any lock but returns with gcwq->lock | 1529 | * Might sleep. Called without any lock but returns with pool->lock |
| 1598 | * held. | 1530 | * held. |
| 1599 | * | 1531 | * |
| 1600 | * RETURNS: | 1532 | * RETURNS: |
| 1601 | * %true if the associated gcwq is online (@worker is successfully | 1533 | * %true if the associated pool is online (@worker is successfully |
| 1602 | * bound), %false if offline. | 1534 | * bound), %false if offline. |
| 1603 | */ | 1535 | */ |
| 1604 | static bool worker_maybe_bind_and_lock(struct worker *worker) | 1536 | static bool worker_maybe_bind_and_lock(struct worker *worker) |
| 1605 | __acquires(&gcwq->lock) | 1537 | __acquires(&pool->lock) |
| 1606 | { | 1538 | { |
| 1607 | struct global_cwq *gcwq = worker->pool->gcwq; | 1539 | struct worker_pool *pool = worker->pool; |
| 1608 | struct task_struct *task = worker->task; | 1540 | struct task_struct *task = worker->task; |
| 1609 | 1541 | ||
| 1610 | while (true) { | 1542 | while (true) { |
| @@ -1612,19 +1544,19 @@ __acquires(&gcwq->lock) | |||
| 1612 | * The following call may fail, succeed or succeed | 1544 | * The following call may fail, succeed or succeed |
| 1613 | * without actually migrating the task to the cpu if | 1545 | * without actually migrating the task to the cpu if |
| 1614 | * it races with cpu hotunplug operation. Verify | 1546 | * it races with cpu hotunplug operation. Verify |
| 1615 | * against GCWQ_DISASSOCIATED. | 1547 | * against POOL_DISASSOCIATED. |
| 1616 | */ | 1548 | */ |
| 1617 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) | 1549 | if (!(pool->flags & POOL_DISASSOCIATED)) |
| 1618 | set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); | 1550 | set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); |
| 1619 | 1551 | ||
| 1620 | spin_lock_irq(&gcwq->lock); | 1552 | spin_lock_irq(&pool->lock); |
| 1621 | if (gcwq->flags & GCWQ_DISASSOCIATED) | 1553 | if (pool->flags & POOL_DISASSOCIATED) |
| 1622 | return false; | 1554 | return false; |
| 1623 | if (task_cpu(task) == gcwq->cpu && | 1555 | if (task_cpu(task) == pool->cpu && |
| 1624 | cpumask_equal(¤t->cpus_allowed, | 1556 | cpumask_equal(¤t->cpus_allowed, |
| 1625 | get_cpu_mask(gcwq->cpu))) | 1557 | get_cpu_mask(pool->cpu))) |
| 1626 | return true; | 1558 | return true; |
| 1627 | spin_unlock_irq(&gcwq->lock); | 1559 | spin_unlock_irq(&pool->lock); |
| 1628 | 1560 | ||
| 1629 | /* | 1561 | /* |
| 1630 | * We've raced with CPU hot[un]plug. Give it a breather | 1562 | * We've raced with CPU hot[un]plug. Give it a breather |
| @@ -1643,15 +1575,13 @@ __acquires(&gcwq->lock) | |||
| 1643 | */ | 1575 | */ |
| 1644 | static void idle_worker_rebind(struct worker *worker) | 1576 | static void idle_worker_rebind(struct worker *worker) |
| 1645 | { | 1577 | { |
| 1646 | struct global_cwq *gcwq = worker->pool->gcwq; | ||
| 1647 | |||
| 1648 | /* CPU may go down again inbetween, clear UNBOUND only on success */ | 1578 | /* CPU may go down again inbetween, clear UNBOUND only on success */ |
| 1649 | if (worker_maybe_bind_and_lock(worker)) | 1579 | if (worker_maybe_bind_and_lock(worker)) |
| 1650 | worker_clr_flags(worker, WORKER_UNBOUND); | 1580 | worker_clr_flags(worker, WORKER_UNBOUND); |
| 1651 | 1581 | ||
| 1652 | /* rebind complete, become available again */ | 1582 | /* rebind complete, become available again */ |
| 1653 | list_add(&worker->entry, &worker->pool->idle_list); | 1583 | list_add(&worker->entry, &worker->pool->idle_list); |
| 1654 | spin_unlock_irq(&gcwq->lock); | 1584 | spin_unlock_irq(&worker->pool->lock); |
| 1655 | } | 1585 | } |
| 1656 | 1586 | ||
| 1657 | /* | 1587 | /* |
| @@ -1663,19 +1593,18 @@ static void idle_worker_rebind(struct worker *worker) | |||
| 1663 | static void busy_worker_rebind_fn(struct work_struct *work) | 1593 | static void busy_worker_rebind_fn(struct work_struct *work) |
| 1664 | { | 1594 | { |
| 1665 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1595 | struct worker *worker = container_of(work, struct worker, rebind_work); |
| 1666 | struct global_cwq *gcwq = worker->pool->gcwq; | ||
| 1667 | 1596 | ||
| 1668 | if (worker_maybe_bind_and_lock(worker)) | 1597 | if (worker_maybe_bind_and_lock(worker)) |
| 1669 | worker_clr_flags(worker, WORKER_UNBOUND); | 1598 | worker_clr_flags(worker, WORKER_UNBOUND); |
| 1670 | 1599 | ||
| 1671 | spin_unlock_irq(&gcwq->lock); | 1600 | spin_unlock_irq(&worker->pool->lock); |
| 1672 | } | 1601 | } |
| 1673 | 1602 | ||
| 1674 | /** | 1603 | /** |
| 1675 | * rebind_workers - rebind all workers of a gcwq to the associated CPU | 1604 | * rebind_workers - rebind all workers of a pool to the associated CPU |
| 1676 | * @gcwq: gcwq of interest | 1605 | * @pool: pool of interest |
| 1677 | * | 1606 | * |
| 1678 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding | 1607 | * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding |
| 1679 | * is different for idle and busy ones. | 1608 | * is different for idle and busy ones. |
| 1680 | * | 1609 | * |
| 1681 | * Idle ones will be removed from the idle_list and woken up. They will | 1610 | * Idle ones will be removed from the idle_list and woken up. They will |
| @@ -1693,38 +1622,31 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
| 1693 | * including the manager will not appear on @idle_list until rebind is | 1622 | * including the manager will not appear on @idle_list until rebind is |
| 1694 | * complete, making local wake-ups safe. | 1623 | * complete, making local wake-ups safe. |
| 1695 | */ | 1624 | */ |
| 1696 | static void rebind_workers(struct global_cwq *gcwq) | 1625 | static void rebind_workers(struct worker_pool *pool) |
| 1697 | { | 1626 | { |
| 1698 | struct worker_pool *pool; | ||
| 1699 | struct worker *worker, *n; | 1627 | struct worker *worker, *n; |
| 1700 | struct hlist_node *pos; | ||
| 1701 | int i; | 1628 | int i; |
| 1702 | 1629 | ||
| 1703 | lockdep_assert_held(&gcwq->lock); | 1630 | lockdep_assert_held(&pool->assoc_mutex); |
| 1704 | 1631 | lockdep_assert_held(&pool->lock); | |
| 1705 | for_each_worker_pool(pool, gcwq) | ||
| 1706 | lockdep_assert_held(&pool->assoc_mutex); | ||
| 1707 | 1632 | ||
| 1708 | /* dequeue and kick idle ones */ | 1633 | /* dequeue and kick idle ones */ |
| 1709 | for_each_worker_pool(pool, gcwq) { | 1634 | list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { |
| 1710 | list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { | 1635 | /* |
| 1711 | /* | 1636 | * idle workers should be off @pool->idle_list until rebind |
| 1712 | * idle workers should be off @pool->idle_list | 1637 | * is complete to avoid receiving premature local wake-ups. |
| 1713 | * until rebind is complete to avoid receiving | 1638 | */ |
| 1714 | * premature local wake-ups. | 1639 | list_del_init(&worker->entry); |
| 1715 | */ | ||
| 1716 | list_del_init(&worker->entry); | ||
| 1717 | 1640 | ||
| 1718 | /* | 1641 | /* |
| 1719 | * worker_thread() will see the above dequeuing | 1642 | * worker_thread() will see the above dequeuing and call |
| 1720 | * and call idle_worker_rebind(). | 1643 | * idle_worker_rebind(). |
| 1721 | */ | 1644 | */ |
| 1722 | wake_up_process(worker->task); | 1645 | wake_up_process(worker->task); |
| 1723 | } | ||
| 1724 | } | 1646 | } |
| 1725 | 1647 | ||
| 1726 | /* rebind busy workers */ | 1648 | /* rebind busy workers */ |
| 1727 | for_each_busy_worker(worker, i, pos, gcwq) { | 1649 | for_each_busy_worker(worker, i, pool) { |
| 1728 | struct work_struct *rebind_work = &worker->rebind_work; | 1650 | struct work_struct *rebind_work = &worker->rebind_work; |
| 1729 | struct workqueue_struct *wq; | 1651 | struct workqueue_struct *wq; |
| 1730 | 1652 | ||
| @@ -1736,16 +1658,16 @@ static void rebind_workers(struct global_cwq *gcwq) | |||
| 1736 | 1658 | ||
| 1737 | /* | 1659 | /* |
| 1738 | * wq doesn't really matter but let's keep @worker->pool | 1660 | * wq doesn't really matter but let's keep @worker->pool |
| 1739 | * and @cwq->pool consistent for sanity. | 1661 | * and @pwq->pool consistent for sanity. |
| 1740 | */ | 1662 | */ |
| 1741 | if (worker_pool_pri(worker->pool)) | 1663 | if (std_worker_pool_pri(worker->pool)) |
| 1742 | wq = system_highpri_wq; | 1664 | wq = system_highpri_wq; |
| 1743 | else | 1665 | else |
| 1744 | wq = system_wq; | 1666 | wq = system_wq; |
| 1745 | 1667 | ||
| 1746 | insert_work(get_cwq(gcwq->cpu, wq), rebind_work, | 1668 | insert_work(get_pwq(pool->cpu, wq), rebind_work, |
| 1747 | worker->scheduled.next, | 1669 | worker->scheduled.next, |
| 1748 | work_color_to_flags(WORK_NO_COLOR)); | 1670 | work_color_to_flags(WORK_NO_COLOR)); |
| 1749 | } | 1671 | } |
| 1750 | } | 1672 | } |
| 1751 | 1673 | ||
| @@ -1780,19 +1702,18 @@ static struct worker *alloc_worker(void) | |||
| 1780 | */ | 1702 | */ |
| 1781 | static struct worker *create_worker(struct worker_pool *pool) | 1703 | static struct worker *create_worker(struct worker_pool *pool) |
| 1782 | { | 1704 | { |
| 1783 | struct global_cwq *gcwq = pool->gcwq; | 1705 | const char *pri = std_worker_pool_pri(pool) ? "H" : ""; |
| 1784 | const char *pri = worker_pool_pri(pool) ? "H" : ""; | ||
| 1785 | struct worker *worker = NULL; | 1706 | struct worker *worker = NULL; |
| 1786 | int id = -1; | 1707 | int id = -1; |
| 1787 | 1708 | ||
| 1788 | spin_lock_irq(&gcwq->lock); | 1709 | spin_lock_irq(&pool->lock); |
| 1789 | while (ida_get_new(&pool->worker_ida, &id)) { | 1710 | while (ida_get_new(&pool->worker_ida, &id)) { |
| 1790 | spin_unlock_irq(&gcwq->lock); | 1711 | spin_unlock_irq(&pool->lock); |
| 1791 | if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) | 1712 | if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) |
| 1792 | goto fail; | 1713 | goto fail; |
| 1793 | spin_lock_irq(&gcwq->lock); | 1714 | spin_lock_irq(&pool->lock); |
| 1794 | } | 1715 | } |
| 1795 | spin_unlock_irq(&gcwq->lock); | 1716 | spin_unlock_irq(&pool->lock); |
| 1796 | 1717 | ||
| 1797 | worker = alloc_worker(); | 1718 | worker = alloc_worker(); |
| 1798 | if (!worker) | 1719 | if (!worker) |
| @@ -1801,30 +1722,30 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
| 1801 | worker->pool = pool; | 1722 | worker->pool = pool; |
| 1802 | worker->id = id; | 1723 | worker->id = id; |
| 1803 | 1724 | ||
| 1804 | if (gcwq->cpu != WORK_CPU_UNBOUND) | 1725 | if (pool->cpu != WORK_CPU_UNBOUND) |
| 1805 | worker->task = kthread_create_on_node(worker_thread, | 1726 | worker->task = kthread_create_on_node(worker_thread, |
| 1806 | worker, cpu_to_node(gcwq->cpu), | 1727 | worker, cpu_to_node(pool->cpu), |
| 1807 | "kworker/%u:%d%s", gcwq->cpu, id, pri); | 1728 | "kworker/%u:%d%s", pool->cpu, id, pri); |
| 1808 | else | 1729 | else |
| 1809 | worker->task = kthread_create(worker_thread, worker, | 1730 | worker->task = kthread_create(worker_thread, worker, |
| 1810 | "kworker/u:%d%s", id, pri); | 1731 | "kworker/u:%d%s", id, pri); |
| 1811 | if (IS_ERR(worker->task)) | 1732 | if (IS_ERR(worker->task)) |
| 1812 | goto fail; | 1733 | goto fail; |
| 1813 | 1734 | ||
| 1814 | if (worker_pool_pri(pool)) | 1735 | if (std_worker_pool_pri(pool)) |
| 1815 | set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); | 1736 | set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); |
| 1816 | 1737 | ||
| 1817 | /* | 1738 | /* |
| 1818 | * Determine CPU binding of the new worker depending on | 1739 | * Determine CPU binding of the new worker depending on |
| 1819 | * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the | 1740 | * %POOL_DISASSOCIATED. The caller is responsible for ensuring the |
| 1820 | * flag remains stable across this function. See the comments | 1741 | * flag remains stable across this function. See the comments |
| 1821 | * above the flag definition for details. | 1742 | * above the flag definition for details. |
| 1822 | * | 1743 | * |
| 1823 | * As an unbound worker may later become a regular one if CPU comes | 1744 | * As an unbound worker may later become a regular one if CPU comes |
| 1824 | * online, make sure every worker has %PF_THREAD_BOUND set. | 1745 | * online, make sure every worker has %PF_THREAD_BOUND set. |
| 1825 | */ | 1746 | */ |
| 1826 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { | 1747 | if (!(pool->flags & POOL_DISASSOCIATED)) { |
| 1827 | kthread_bind(worker->task, gcwq->cpu); | 1748 | kthread_bind(worker->task, pool->cpu); |
| 1828 | } else { | 1749 | } else { |
| 1829 | worker->task->flags |= PF_THREAD_BOUND; | 1750 | worker->task->flags |= PF_THREAD_BOUND; |
| 1830 | worker->flags |= WORKER_UNBOUND; | 1751 | worker->flags |= WORKER_UNBOUND; |
| @@ -1833,9 +1754,9 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
| 1833 | return worker; | 1754 | return worker; |
| 1834 | fail: | 1755 | fail: |
| 1835 | if (id >= 0) { | 1756 | if (id >= 0) { |
| 1836 | spin_lock_irq(&gcwq->lock); | 1757 | spin_lock_irq(&pool->lock); |
| 1837 | ida_remove(&pool->worker_ida, id); | 1758 | ida_remove(&pool->worker_ida, id); |
| 1838 | spin_unlock_irq(&gcwq->lock); | 1759 | spin_unlock_irq(&pool->lock); |
| 1839 | } | 1760 | } |
| 1840 | kfree(worker); | 1761 | kfree(worker); |
| 1841 | return NULL; | 1762 | return NULL; |
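The while (ida_get_new(...)) loop in create_worker() is an instance of a common pattern: try to allocate an ID under the spinlock, and if the allocator needs memory, drop the lock, preallocate with GFP_KERNEL, and retry. The userspace sketch below mimics the shape of that loop with a toy allocator and a pthread mutex; try_get_id() and prealloc() are invented stand-ins, not the ida API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Toy allocator: hands out ids from a cache that may run empty. */
static int id_cache[4];
static int id_cache_n;
static int next_id;

/* Fails (returns false) when the cache is empty, like ida needing memory. */
static bool try_get_id(int *id)
{
        if (!id_cache_n)
                return false;
        *id = id_cache[--id_cache_n];
        return true;
}

/* May sleep/allocate, so it must be called without the lock held. */
static void prealloc(void)
{
        id_cache[id_cache_n++] = next_id++;
}

static int alloc_worker_id(void)
{
        int id;

        pthread_mutex_lock(&pool_lock);
        while (!try_get_id(&id)) {
                /* can't allocate under the lock: drop it, refill, retry */
                pthread_mutex_unlock(&pool_lock);
                prealloc();
                pthread_mutex_lock(&pool_lock);
        }
        pthread_mutex_unlock(&pool_lock);
        return id;
}

int main(void)
{
        printf("worker id %d\n", alloc_worker_id());
        printf("worker id %d\n", alloc_worker_id());
        return 0;
}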
| @@ -1845,10 +1766,10 @@ fail: | |||
| 1845 | * start_worker - start a newly created worker | 1766 | * start_worker - start a newly created worker |
| 1846 | * @worker: worker to start | 1767 | * @worker: worker to start |
| 1847 | * | 1768 | * |
| 1848 | * Make the gcwq aware of @worker and start it. | 1769 | * Make the pool aware of @worker and start it. |
| 1849 | * | 1770 | * |
| 1850 | * CONTEXT: | 1771 | * CONTEXT: |
| 1851 | * spin_lock_irq(gcwq->lock). | 1772 | * spin_lock_irq(pool->lock). |
| 1852 | */ | 1773 | */ |
| 1853 | static void start_worker(struct worker *worker) | 1774 | static void start_worker(struct worker *worker) |
| 1854 | { | 1775 | { |
| @@ -1862,15 +1783,14 @@ static void start_worker(struct worker *worker) | |||
| 1862 | * destroy_worker - destroy a workqueue worker | 1783 | * destroy_worker - destroy a workqueue worker |
| 1863 | * @worker: worker to be destroyed | 1784 | * @worker: worker to be destroyed |
| 1864 | * | 1785 | * |
| 1865 | * Destroy @worker and adjust @gcwq stats accordingly. | 1786 | * Destroy @worker and adjust @pool stats accordingly. |
| 1866 | * | 1787 | * |
| 1867 | * CONTEXT: | 1788 | * CONTEXT: |
| 1868 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | 1789 | * spin_lock_irq(pool->lock) which is released and regrabbed. |
| 1869 | */ | 1790 | */ |
| 1870 | static void destroy_worker(struct worker *worker) | 1791 | static void destroy_worker(struct worker *worker) |
| 1871 | { | 1792 | { |
| 1872 | struct worker_pool *pool = worker->pool; | 1793 | struct worker_pool *pool = worker->pool; |
| 1873 | struct global_cwq *gcwq = pool->gcwq; | ||
| 1874 | int id = worker->id; | 1794 | int id = worker->id; |
| 1875 | 1795 | ||
| 1876 | /* sanity check frenzy */ | 1796 | /* sanity check frenzy */ |
| @@ -1885,21 +1805,20 @@ static void destroy_worker(struct worker *worker) | |||
| 1885 | list_del_init(&worker->entry); | 1805 | list_del_init(&worker->entry); |
| 1886 | worker->flags |= WORKER_DIE; | 1806 | worker->flags |= WORKER_DIE; |
| 1887 | 1807 | ||
| 1888 | spin_unlock_irq(&gcwq->lock); | 1808 | spin_unlock_irq(&pool->lock); |
| 1889 | 1809 | ||
| 1890 | kthread_stop(worker->task); | 1810 | kthread_stop(worker->task); |
| 1891 | kfree(worker); | 1811 | kfree(worker); |
| 1892 | 1812 | ||
| 1893 | spin_lock_irq(&gcwq->lock); | 1813 | spin_lock_irq(&pool->lock); |
| 1894 | ida_remove(&pool->worker_ida, id); | 1814 | ida_remove(&pool->worker_ida, id); |
| 1895 | } | 1815 | } |
| 1896 | 1816 | ||
| 1897 | static void idle_worker_timeout(unsigned long __pool) | 1817 | static void idle_worker_timeout(unsigned long __pool) |
| 1898 | { | 1818 | { |
| 1899 | struct worker_pool *pool = (void *)__pool; | 1819 | struct worker_pool *pool = (void *)__pool; |
| 1900 | struct global_cwq *gcwq = pool->gcwq; | ||
| 1901 | 1820 | ||
| 1902 | spin_lock_irq(&gcwq->lock); | 1821 | spin_lock_irq(&pool->lock); |
| 1903 | 1822 | ||
| 1904 | if (too_many_workers(pool)) { | 1823 | if (too_many_workers(pool)) { |
| 1905 | struct worker *worker; | 1824 | struct worker *worker; |
| @@ -1918,20 +1837,20 @@ static void idle_worker_timeout(unsigned long __pool) | |||
| 1918 | } | 1837 | } |
| 1919 | } | 1838 | } |
| 1920 | 1839 | ||
| 1921 | spin_unlock_irq(&gcwq->lock); | 1840 | spin_unlock_irq(&pool->lock); |
| 1922 | } | 1841 | } |
| 1923 | 1842 | ||
| 1924 | static bool send_mayday(struct work_struct *work) | 1843 | static bool send_mayday(struct work_struct *work) |
| 1925 | { | 1844 | { |
| 1926 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | 1845 | struct pool_workqueue *pwq = get_work_pwq(work); |
| 1927 | struct workqueue_struct *wq = cwq->wq; | 1846 | struct workqueue_struct *wq = pwq->wq; |
| 1928 | unsigned int cpu; | 1847 | unsigned int cpu; |
| 1929 | 1848 | ||
| 1930 | if (!(wq->flags & WQ_RESCUER)) | 1849 | if (!(wq->flags & WQ_RESCUER)) |
| 1931 | return false; | 1850 | return false; |
| 1932 | 1851 | ||
| 1933 | /* mayday mayday mayday */ | 1852 | /* mayday mayday mayday */ |
| 1934 | cpu = cwq->pool->gcwq->cpu; | 1853 | cpu = pwq->pool->cpu; |
| 1935 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | 1854 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ |
| 1936 | if (cpu == WORK_CPU_UNBOUND) | 1855 | if (cpu == WORK_CPU_UNBOUND) |
| 1937 | cpu = 0; | 1856 | cpu = 0; |
| @@ -1940,13 +1859,12 @@ static bool send_mayday(struct work_struct *work) | |||
| 1940 | return true; | 1859 | return true; |
| 1941 | } | 1860 | } |
| 1942 | 1861 | ||
| 1943 | static void gcwq_mayday_timeout(unsigned long __pool) | 1862 | static void pool_mayday_timeout(unsigned long __pool) |
| 1944 | { | 1863 | { |
| 1945 | struct worker_pool *pool = (void *)__pool; | 1864 | struct worker_pool *pool = (void *)__pool; |
| 1946 | struct global_cwq *gcwq = pool->gcwq; | ||
| 1947 | struct work_struct *work; | 1865 | struct work_struct *work; |
| 1948 | 1866 | ||
| 1949 | spin_lock_irq(&gcwq->lock); | 1867 | spin_lock_irq(&pool->lock); |
| 1950 | 1868 | ||
| 1951 | if (need_to_create_worker(pool)) { | 1869 | if (need_to_create_worker(pool)) { |
| 1952 | /* | 1870 | /* |
| @@ -1959,7 +1877,7 @@ static void gcwq_mayday_timeout(unsigned long __pool) | |||
| 1959 | send_mayday(work); | 1877 | send_mayday(work); |
| 1960 | } | 1878 | } |
| 1961 | 1879 | ||
| 1962 | spin_unlock_irq(&gcwq->lock); | 1880 | spin_unlock_irq(&pool->lock); |
| 1963 | 1881 | ||
| 1964 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1882 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
| 1965 | } | 1883 | } |
| @@ -1978,24 +1896,22 @@ static void gcwq_mayday_timeout(unsigned long __pool) | |||
| 1978 | * may_start_working() true. | 1896 | * may_start_working() true. |
| 1979 | * | 1897 | * |
| 1980 | * LOCKING: | 1898 | * LOCKING: |
| 1981 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | 1899 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 1982 | * multiple times. Does GFP_KERNEL allocations. Called only from | 1900 | * multiple times. Does GFP_KERNEL allocations. Called only from |
| 1983 | * manager. | 1901 | * manager. |
| 1984 | * | 1902 | * |
| 1985 | * RETURNS: | 1903 | * RETURNS: |
| 1986 | * false if no action was taken and gcwq->lock stayed locked, true | 1904 | * false if no action was taken and pool->lock stayed locked, true |
| 1987 | * otherwise. | 1905 | * otherwise. |
| 1988 | */ | 1906 | */ |
| 1989 | static bool maybe_create_worker(struct worker_pool *pool) | 1907 | static bool maybe_create_worker(struct worker_pool *pool) |
| 1990 | __releases(&gcwq->lock) | 1908 | __releases(&pool->lock) |
| 1991 | __acquires(&gcwq->lock) | 1909 | __acquires(&pool->lock) |
| 1992 | { | 1910 | { |
| 1993 | struct global_cwq *gcwq = pool->gcwq; | ||
| 1994 | |||
| 1995 | if (!need_to_create_worker(pool)) | 1911 | if (!need_to_create_worker(pool)) |
| 1996 | return false; | 1912 | return false; |
| 1997 | restart: | 1913 | restart: |
| 1998 | spin_unlock_irq(&gcwq->lock); | 1914 | spin_unlock_irq(&pool->lock); |
| 1999 | 1915 | ||
| 2000 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | 1916 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ |
| 2001 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | 1917 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); |
| @@ -2006,7 +1922,7 @@ restart: | |||
| 2006 | worker = create_worker(pool); | 1922 | worker = create_worker(pool); |
| 2007 | if (worker) { | 1923 | if (worker) { |
| 2008 | del_timer_sync(&pool->mayday_timer); | 1924 | del_timer_sync(&pool->mayday_timer); |
| 2009 | spin_lock_irq(&gcwq->lock); | 1925 | spin_lock_irq(&pool->lock); |
| 2010 | start_worker(worker); | 1926 | start_worker(worker); |
| 2011 | BUG_ON(need_to_create_worker(pool)); | 1927 | BUG_ON(need_to_create_worker(pool)); |
| 2012 | return true; | 1928 | return true; |
| @@ -2023,7 +1939,7 @@ restart: | |||
| 2023 | } | 1939 | } |
| 2024 | 1940 | ||
| 2025 | del_timer_sync(&pool->mayday_timer); | 1941 | del_timer_sync(&pool->mayday_timer); |
| 2026 | spin_lock_irq(&gcwq->lock); | 1942 | spin_lock_irq(&pool->lock); |
| 2027 | if (need_to_create_worker(pool)) | 1943 | if (need_to_create_worker(pool)) |
| 2028 | goto restart; | 1944 | goto restart; |
| 2029 | return true; | 1945 | return true; |
| @@ -2037,11 +1953,11 @@ restart: | |||
| 2037 | * IDLE_WORKER_TIMEOUT. | 1953 | * IDLE_WORKER_TIMEOUT. |
| 2038 | * | 1954 | * |
| 2039 | * LOCKING: | 1955 | * LOCKING: |
| 2040 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | 1956 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 2041 | * multiple times. Called only from manager. | 1957 | * multiple times. Called only from manager. |
| 2042 | * | 1958 | * |
| 2043 | * RETURNS: | 1959 | * RETURNS: |
| 2044 | * false if no action was taken and gcwq->lock stayed locked, true | 1960 | * false if no action was taken and pool->lock stayed locked, true |
| 2045 | * otherwise. | 1961 | * otherwise. |
| 2046 | */ | 1962 | */ |
| 2047 | static bool maybe_destroy_workers(struct worker_pool *pool) | 1963 | static bool maybe_destroy_workers(struct worker_pool *pool) |
| @@ -2071,21 +1987,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool) | |||
| 2071 | * manage_workers - manage worker pool | 1987 | * manage_workers - manage worker pool |
| 2072 | * @worker: self | 1988 | * @worker: self |
| 2073 | * | 1989 | * |
| 2074 | * Assume the manager role and manage gcwq worker pool @worker belongs | 1990 | * Assume the manager role and manage the worker pool @worker belongs |
| 2075 | * to. At any given time, there can be only zero or one manager per | 1991 | * to. At any given time, there can be only zero or one manager per |
| 2076 | * gcwq. The exclusion is handled automatically by this function. | 1992 | * pool. The exclusion is handled automatically by this function. |
| 2077 | * | 1993 | * |
| 2078 | * The caller can safely start processing works on false return. On | 1994 | * The caller can safely start processing works on false return. On |
| 2079 | * true return, it's guaranteed that need_to_create_worker() is false | 1995 | * true return, it's guaranteed that need_to_create_worker() is false |
| 2080 | * and may_start_working() is true. | 1996 | * and may_start_working() is true. |
| 2081 | * | 1997 | * |
| 2082 | * CONTEXT: | 1998 | * CONTEXT: |
| 2083 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | 1999 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 2084 | * multiple times. Does GFP_KERNEL allocations. | 2000 | * multiple times. Does GFP_KERNEL allocations. |
| 2085 | * | 2001 | * |
| 2086 | * RETURNS: | 2002 | * RETURNS: |
| 2087 | * false if no action was taken and gcwq->lock stayed locked, true if | 2003 | * false if no action was taken and pool->lock stayed locked, true if |
| 2088 | * some action was taken. | 2004 | * some action was taken. |
| 2089 | */ | 2005 | */ |
| 2090 | static bool manage_workers(struct worker *worker) | 2006 | static bool manage_workers(struct worker *worker) |
| 2091 | { | 2007 | { |
| @@ -2107,20 +2023,20 @@ static bool manage_workers(struct worker *worker) | |||
| 2107 | * manager against CPU hotplug. | 2023 | * manager against CPU hotplug. |
| 2108 | * | 2024 | * |
| 2109 | * assoc_mutex would always be free unless CPU hotplug is in | 2025 | * assoc_mutex would always be free unless CPU hotplug is in |
| 2110 | * progress. trylock first without dropping @gcwq->lock. | 2026 | * progress. trylock first without dropping @pool->lock. |
| 2111 | */ | 2027 | */ |
| 2112 | if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { | 2028 | if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { |
| 2113 | spin_unlock_irq(&pool->gcwq->lock); | 2029 | spin_unlock_irq(&pool->lock); |
| 2114 | mutex_lock(&pool->assoc_mutex); | 2030 | mutex_lock(&pool->assoc_mutex); |
| 2115 | /* | 2031 | /* |
| 2116 | * CPU hotplug could have happened while we were waiting | 2032 | * CPU hotplug could have happened while we were waiting |
| 2117 | * for assoc_mutex. Hotplug itself can't handle us | 2033 | * for assoc_mutex. Hotplug itself can't handle us |
| 2118 | * because manager isn't either on idle or busy list, and | 2034 | * because manager isn't either on idle or busy list, and |
| 2119 | * @gcwq's state and ours could have deviated. | 2035 | * @pool's state and ours could have deviated. |
| 2120 | * | 2036 | * |
| 2121 | * As hotplug is now excluded via assoc_mutex, we can | 2037 | * As hotplug is now excluded via assoc_mutex, we can |
| 2122 | * simply try to bind. It will succeed or fail depending | 2038 | * simply try to bind. It will succeed or fail depending |
| 2123 | * on @gcwq's current state. Try it and adjust | 2039 | * on @pool's current state. Try it and adjust |
| 2124 | * %WORKER_UNBOUND accordingly. | 2040 | * %WORKER_UNBOUND accordingly. |
| 2125 | */ | 2041 | */ |
| 2126 | if (worker_maybe_bind_and_lock(worker)) | 2042 | if (worker_maybe_bind_and_lock(worker)) |
| @@ -2157,18 +2073,15 @@ static bool manage_workers(struct worker *worker) | |||
| 2157 | * call this function to process a work. | 2073 | * call this function to process a work. |
| 2158 | * | 2074 | * |
| 2159 | * CONTEXT: | 2075 | * CONTEXT: |
| 2160 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | 2076 | * spin_lock_irq(pool->lock) which is released and regrabbed. |
| 2161 | */ | 2077 | */ |
| 2162 | static void process_one_work(struct worker *worker, struct work_struct *work) | 2078 | static void process_one_work(struct worker *worker, struct work_struct *work) |
| 2163 | __releases(&gcwq->lock) | 2079 | __releases(&pool->lock) |
| 2164 | __acquires(&gcwq->lock) | 2080 | __acquires(&pool->lock) |
| 2165 | { | 2081 | { |
| 2166 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | 2082 | struct pool_workqueue *pwq = get_work_pwq(work); |
| 2167 | struct worker_pool *pool = worker->pool; | 2083 | struct worker_pool *pool = worker->pool; |
| 2168 | struct global_cwq *gcwq = pool->gcwq; | 2084 | bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; |
| 2169 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | ||
| 2170 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | ||
| 2171 | work_func_t f = work->func; | ||
| 2172 | int work_color; | 2085 | int work_color; |
| 2173 | struct worker *collision; | 2086 | struct worker *collision; |
| 2174 | #ifdef CONFIG_LOCKDEP | 2087 | #ifdef CONFIG_LOCKDEP |
| @@ -2186,11 +2099,11 @@ __acquires(&gcwq->lock) | |||
| 2186 | /* | 2099 | /* |
| 2187 | * Ensure we're on the correct CPU. DISASSOCIATED test is | 2100 | * Ensure we're on the correct CPU. DISASSOCIATED test is |
| 2188 | * necessary to avoid spurious warnings from rescuers servicing the | 2101 | * necessary to avoid spurious warnings from rescuers servicing the |
| 2189 | * unbound or a disassociated gcwq. | 2102 | * unbound or a disassociated pool. |
| 2190 | */ | 2103 | */ |
| 2191 | WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && | 2104 | WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && |
| 2192 | !(gcwq->flags & GCWQ_DISASSOCIATED) && | 2105 | !(pool->flags & POOL_DISASSOCIATED) && |
| 2193 | raw_smp_processor_id() != gcwq->cpu); | 2106 | raw_smp_processor_id() != pool->cpu); |
| 2194 | 2107 | ||
| 2195 | /* | 2108 | /* |
| 2196 | * A single work shouldn't be executed concurrently by | 2109 | * A single work shouldn't be executed concurrently by |
| @@ -2198,7 +2111,7 @@ __acquires(&gcwq->lock) | |||
| 2198 | * already processing the work. If so, defer the work to the | 2111 | * already processing the work. If so, defer the work to the |
| 2199 | * currently executing one. | 2112 | * currently executing one. |
| 2200 | */ | 2113 | */ |
| 2201 | collision = __find_worker_executing_work(gcwq, bwh, work); | 2114 | collision = find_worker_executing_work(pool, work); |
| 2202 | if (unlikely(collision)) { | 2115 | if (unlikely(collision)) { |
| 2203 | move_linked_works(work, &collision->scheduled, NULL); | 2116 | move_linked_works(work, &collision->scheduled, NULL); |
| 2204 | return; | 2117 | return; |
| @@ -2206,9 +2119,10 @@ __acquires(&gcwq->lock) | |||
| 2206 | 2119 | ||
| 2207 | /* claim and dequeue */ | 2120 | /* claim and dequeue */ |
| 2208 | debug_work_deactivate(work); | 2121 | debug_work_deactivate(work); |
| 2209 | hlist_add_head(&worker->hentry, bwh); | 2122 | hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); |
| 2210 | worker->current_work = work; | 2123 | worker->current_work = work; |
| 2211 | worker->current_cwq = cwq; | 2124 | worker->current_func = work->func; |
| 2125 | worker->current_pwq = pwq; | ||
| 2212 | work_color = get_work_color(work); | 2126 | work_color = get_work_color(work); |
| 2213 | 2127 | ||
| 2214 | list_del_init(&work->entry); | 2128 | list_del_init(&work->entry); |
| @@ -2221,53 +2135,55 @@ __acquires(&gcwq->lock) | |||
| 2221 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | 2135 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); |
| 2222 | 2136 | ||
| 2223 | /* | 2137 | /* |
| 2224 | * Unbound gcwq isn't concurrency managed and work items should be | 2138 | * Unbound pool isn't concurrency managed and work items should be |
| 2225 | * executed ASAP. Wake up another worker if necessary. | 2139 | * executed ASAP. Wake up another worker if necessary. |
| 2226 | */ | 2140 | */ |
| 2227 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | 2141 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) |
| 2228 | wake_up_worker(pool); | 2142 | wake_up_worker(pool); |
| 2229 | 2143 | ||
| 2230 | /* | 2144 | /* |
| 2231 | * Record the last CPU and clear PENDING which should be the last | 2145 | * Record the last pool and clear PENDING which should be the last |
| 2232 | * update to @work. Also, do this inside @gcwq->lock so that | 2146 | * update to @work. Also, do this inside @pool->lock so that |
| 2233 | * PENDING and queued state changes happen together while IRQ is | 2147 | * PENDING and queued state changes happen together while IRQ is |
| 2234 | * disabled. | 2148 | * disabled. |
| 2235 | */ | 2149 | */ |
| 2236 | set_work_cpu_and_clear_pending(work, gcwq->cpu); | 2150 | set_work_pool_and_clear_pending(work, pool->id); |
| 2237 | 2151 | ||
| 2238 | spin_unlock_irq(&gcwq->lock); | 2152 | spin_unlock_irq(&pool->lock); |
| 2239 | 2153 | ||
| 2240 | lock_map_acquire_read(&cwq->wq->lockdep_map); | 2154 | lock_map_acquire_read(&pwq->wq->lockdep_map); |
| 2241 | lock_map_acquire(&lockdep_map); | 2155 | lock_map_acquire(&lockdep_map); |
| 2242 | trace_workqueue_execute_start(work); | 2156 | trace_workqueue_execute_start(work); |
| 2243 | f(work); | 2157 | worker->current_func(work); |
| 2244 | /* | 2158 | /* |
| 2245 | * While we must be careful to not use "work" after this, the trace | 2159 | * While we must be careful to not use "work" after this, the trace |
| 2246 | * point will only record its address. | 2160 | * point will only record its address. |
| 2247 | */ | 2161 | */ |
| 2248 | trace_workqueue_execute_end(work); | 2162 | trace_workqueue_execute_end(work); |
| 2249 | lock_map_release(&lockdep_map); | 2163 | lock_map_release(&lockdep_map); |
| 2250 | lock_map_release(&cwq->wq->lockdep_map); | 2164 | lock_map_release(&pwq->wq->lockdep_map); |
| 2251 | 2165 | ||
| 2252 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 2166 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { |
| 2253 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" | 2167 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" |
| 2254 | " last function: %pf\n", | 2168 | " last function: %pf\n", |
| 2255 | current->comm, preempt_count(), task_pid_nr(current), f); | 2169 | current->comm, preempt_count(), task_pid_nr(current), |
| 2170 | worker->current_func); | ||
| 2256 | debug_show_held_locks(current); | 2171 | debug_show_held_locks(current); |
| 2257 | dump_stack(); | 2172 | dump_stack(); |
| 2258 | } | 2173 | } |
| 2259 | 2174 | ||
| 2260 | spin_lock_irq(&gcwq->lock); | 2175 | spin_lock_irq(&pool->lock); |
| 2261 | 2176 | ||
| 2262 | /* clear cpu intensive status */ | 2177 | /* clear cpu intensive status */ |
| 2263 | if (unlikely(cpu_intensive)) | 2178 | if (unlikely(cpu_intensive)) |
| 2264 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); | 2179 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); |
| 2265 | 2180 | ||
| 2266 | /* we're done with it, release */ | 2181 | /* we're done with it, release */ |
| 2267 | hlist_del_init(&worker->hentry); | 2182 | hash_del(&worker->hentry); |
| 2268 | worker->current_work = NULL; | 2183 | worker->current_work = NULL; |
| 2269 | worker->current_cwq = NULL; | 2184 | worker->current_func = NULL; |
| 2270 | cwq_dec_nr_in_flight(cwq, work_color); | 2185 | worker->current_pwq = NULL; |
| 2186 | pwq_dec_nr_in_flight(pwq, work_color); | ||
| 2271 | } | 2187 | } |
| 2272 | 2188 | ||
| 2273 | /** | 2189 | /** |
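process_one_work() now files the executing worker in a pool-local busy hash keyed by the work item's address (hash_add/hash_del), which is what find_worker_executing_work() walks when it checks for a collision a few lines earlier. The shape of that bookkeeping can be pictured with a tiny chained hash table in plain C; the bucket count and the pointer hash below are invented for illustration and are not the kernel's hashtable implementation:

#include <stdint.h>
#include <stdio.h>

#define BUSY_HASH_BITS 6
#define BUSY_HASH_SIZE (1u << BUSY_HASH_BITS)

struct toy_worker {
	struct toy_worker *hnext;     /* bucket chaining, like an hlist node */
	const void *current_work;     /* key: address of the executing work */
};

static struct toy_worker *busy_hash[BUSY_HASH_SIZE];

static unsigned int hash_work(const void *work)
{
	/* crude hash: drop alignment bits, keep the low bucket bits */
	return ((uintptr_t)work / sizeof(void *)) & (BUSY_HASH_SIZE - 1);
}

/* hash_add() analogue: record that @w is now executing @work */
static void busy_add(struct toy_worker *w, const void *work)
{
	unsigned int b = hash_work(work);

	w->current_work = work;
	w->hnext = busy_hash[b];
	busy_hash[b] = w;
}

/* find_worker_executing_work() analogue: who, if anyone, runs @work? */
static struct toy_worker *busy_find(const void *work)
{
	for (struct toy_worker *w = busy_hash[hash_work(work)]; w; w = w->hnext)
		if (w->current_work == work)
			return w;
	return NULL;
}

int main(void)
{
	static int some_work;                 /* stands in for a work_struct */
	struct toy_worker me = { 0 };

	busy_add(&me, &some_work);
	printf("executing worker: %p\n", (void *)busy_find(&some_work));
	return 0;
}

Removing the worker from its bucket when the work retires is then a plain unlink, which is all the hash_del() at the end of the hunk does.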
| @@ -2279,7 +2195,7 @@ __acquires(&gcwq->lock) | |||
| 2279 | * fetches a work from the top and executes it. | 2195 | * fetches a work from the top and executes it. |
| 2280 | * | 2196 | * |
| 2281 | * CONTEXT: | 2197 | * CONTEXT: |
| 2282 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | 2198 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 2283 | * multiple times. | 2199 | * multiple times. |
| 2284 | */ | 2200 | */ |
| 2285 | static void process_scheduled_works(struct worker *worker) | 2201 | static void process_scheduled_works(struct worker *worker) |
| @@ -2295,8 +2211,8 @@ static void process_scheduled_works(struct worker *worker) | |||
| 2295 | * worker_thread - the worker thread function | 2211 | * worker_thread - the worker thread function |
| 2296 | * @__worker: self | 2212 | * @__worker: self |
| 2297 | * | 2213 | * |
| 2298 | * The gcwq worker thread function. There's a single dynamic pool of | 2214 | * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools |
| 2299 | * these per each cpu. These workers process all works regardless of | 2215 | * of these per CPU. These workers process all works regardless of |
| 2300 | * their specific target workqueue. The only exception is works which | 2216 | * their specific target workqueue. The only exception is works which |
| 2301 | * belong to workqueues with a rescuer which will be explained in | 2217 | * belong to workqueues with a rescuer which will be explained in |
| 2302 | * rescuer_thread(). | 2218 | * rescuer_thread(). |
| @@ -2305,16 +2221,15 @@ static int worker_thread(void *__worker) | |||
| 2305 | { | 2221 | { |
| 2306 | struct worker *worker = __worker; | 2222 | struct worker *worker = __worker; |
| 2307 | struct worker_pool *pool = worker->pool; | 2223 | struct worker_pool *pool = worker->pool; |
| 2308 | struct global_cwq *gcwq = pool->gcwq; | ||
| 2309 | 2224 | ||
| 2310 | /* tell the scheduler that this is a workqueue worker */ | 2225 | /* tell the scheduler that this is a workqueue worker */ |
| 2311 | worker->task->flags |= PF_WQ_WORKER; | 2226 | worker->task->flags |= PF_WQ_WORKER; |
| 2312 | woke_up: | 2227 | woke_up: |
| 2313 | spin_lock_irq(&gcwq->lock); | 2228 | spin_lock_irq(&pool->lock); |
| 2314 | 2229 | ||
| 2315 | /* we are off idle list if destruction or rebind is requested */ | 2230 | /* we are off idle list if destruction or rebind is requested */ |
| 2316 | if (unlikely(list_empty(&worker->entry))) { | 2231 | if (unlikely(list_empty(&worker->entry))) { |
| 2317 | spin_unlock_irq(&gcwq->lock); | 2232 | spin_unlock_irq(&pool->lock); |
| 2318 | 2233 | ||
| 2319 | /* if DIE is set, destruction is requested */ | 2234 | /* if DIE is set, destruction is requested */ |
| 2320 | if (worker->flags & WORKER_DIE) { | 2235 | if (worker->flags & WORKER_DIE) { |
| @@ -2373,52 +2288,59 @@ sleep: | |||
| 2373 | goto recheck; | 2288 | goto recheck; |
| 2374 | 2289 | ||
| 2375 | /* | 2290 | /* |
| 2376 | * gcwq->lock is held and there's no work to process and no | 2291 | * pool->lock is held and there's no work to process and no need to |
| 2377 | * need to manage, sleep. Workers are woken up only while | 2292 | * manage, sleep. Workers are woken up only while holding |
| 2378 | * holding gcwq->lock or from local cpu, so setting the | 2293 | * pool->lock or from local cpu, so setting the current state |
| 2379 | * current state before releasing gcwq->lock is enough to | 2294 | * before releasing pool->lock is enough to prevent losing any |
| 2380 | * prevent losing any event. | 2295 | * event. |
| 2381 | */ | 2296 | */ |
| 2382 | worker_enter_idle(worker); | 2297 | worker_enter_idle(worker); |
| 2383 | __set_current_state(TASK_INTERRUPTIBLE); | 2298 | __set_current_state(TASK_INTERRUPTIBLE); |
| 2384 | spin_unlock_irq(&gcwq->lock); | 2299 | spin_unlock_irq(&pool->lock); |
| 2385 | schedule(); | 2300 | schedule(); |
| 2386 | goto woke_up; | 2301 | goto woke_up; |
| 2387 | } | 2302 | } |
| 2388 | 2303 | ||
| 2389 | /** | 2304 | /** |
| 2390 | * rescuer_thread - the rescuer thread function | 2305 | * rescuer_thread - the rescuer thread function |
| 2391 | * @__wq: the associated workqueue | 2306 | * @__rescuer: self |
| 2392 | * | 2307 | * |
| 2393 | * Workqueue rescuer thread function. There's one rescuer for each | 2308 | * Workqueue rescuer thread function. There's one rescuer for each |
| 2394 | * workqueue which has WQ_RESCUER set. | 2309 | * workqueue which has WQ_RESCUER set. |
| 2395 | * | 2310 | * |
| 2396 | * Regular work processing on a gcwq may block trying to create a new | 2311 | * Regular work processing on a pool may block trying to create a new |
| 2397 | * worker which uses GFP_KERNEL allocation which has slight chance of | 2312 | * worker which uses GFP_KERNEL allocation which has a slight chance of |
| 2398 | * developing into deadlock if some works currently on the same queue | 2313 | * developing into deadlock if some works currently on the same queue |
| 2399 | * need to be processed to satisfy the GFP_KERNEL allocation. This is | 2314 | * need to be processed to satisfy the GFP_KERNEL allocation. This is |
| 2400 | * the problem rescuer solves. | 2315 | * the problem rescuer solves. |
| 2401 | * | 2316 | * |
| 2402 | * When such condition is possible, the gcwq summons rescuers of all | 2317 | * When such a condition is possible, the pool summons rescuers of all |
| 2403 | * workqueues which have works queued on the gcwq and let them process | 2318 | * workqueues which have works queued on the pool and lets them process |
| 2404 | * those works so that forward progress can be guaranteed. | 2319 | * those works so that forward progress can be guaranteed. |
| 2405 | * | 2320 | * |
| 2406 | * This should happen rarely. | 2321 | * This should happen rarely. |
| 2407 | */ | 2322 | */ |
| 2408 | static int rescuer_thread(void *__wq) | 2323 | static int rescuer_thread(void *__rescuer) |
| 2409 | { | 2324 | { |
| 2410 | struct workqueue_struct *wq = __wq; | 2325 | struct worker *rescuer = __rescuer; |
| 2411 | struct worker *rescuer = wq->rescuer; | 2326 | struct workqueue_struct *wq = rescuer->rescue_wq; |
| 2412 | struct list_head *scheduled = &rescuer->scheduled; | 2327 | struct list_head *scheduled = &rescuer->scheduled; |
| 2413 | bool is_unbound = wq->flags & WQ_UNBOUND; | 2328 | bool is_unbound = wq->flags & WQ_UNBOUND; |
| 2414 | unsigned int cpu; | 2329 | unsigned int cpu; |
| 2415 | 2330 | ||
| 2416 | set_user_nice(current, RESCUER_NICE_LEVEL); | 2331 | set_user_nice(current, RESCUER_NICE_LEVEL); |
| 2332 | |||
| 2333 | /* | ||
| 2334 | * Mark rescuer as worker too. As WORKER_PREP is never cleared, it | ||
| 2335 | * doesn't participate in concurrency management. | ||
| 2336 | */ | ||
| 2337 | rescuer->task->flags |= PF_WQ_WORKER; | ||
| 2417 | repeat: | 2338 | repeat: |
| 2418 | set_current_state(TASK_INTERRUPTIBLE); | 2339 | set_current_state(TASK_INTERRUPTIBLE); |
| 2419 | 2340 | ||
| 2420 | if (kthread_should_stop()) { | 2341 | if (kthread_should_stop()) { |
| 2421 | __set_current_state(TASK_RUNNING); | 2342 | __set_current_state(TASK_RUNNING); |
| 2343 | rescuer->task->flags &= ~PF_WQ_WORKER; | ||
| 2422 | return 0; | 2344 | return 0; |
| 2423 | } | 2345 | } |
| 2424 | 2346 | ||
| @@ -2428,9 +2350,8 @@ repeat: | |||
| 2428 | */ | 2350 | */ |
| 2429 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | 2351 | for_each_mayday_cpu(cpu, wq->mayday_mask) { |
| 2430 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | 2352 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; |
| 2431 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | 2353 | struct pool_workqueue *pwq = get_pwq(tcpu, wq); |
| 2432 | struct worker_pool *pool = cwq->pool; | 2354 | struct worker_pool *pool = pwq->pool; |
| 2433 | struct global_cwq *gcwq = pool->gcwq; | ||
| 2434 | struct work_struct *work, *n; | 2355 | struct work_struct *work, *n; |
| 2435 | 2356 | ||
| 2436 | __set_current_state(TASK_RUNNING); | 2357 | __set_current_state(TASK_RUNNING); |
| @@ -2446,22 +2367,24 @@ repeat: | |||
| 2446 | */ | 2367 | */ |
| 2447 | BUG_ON(!list_empty(&rescuer->scheduled)); | 2368 | BUG_ON(!list_empty(&rescuer->scheduled)); |
| 2448 | list_for_each_entry_safe(work, n, &pool->worklist, entry) | 2369 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
| 2449 | if (get_work_cwq(work) == cwq) | 2370 | if (get_work_pwq(work) == pwq) |
| 2450 | move_linked_works(work, scheduled, &n); | 2371 | move_linked_works(work, scheduled, &n); |
| 2451 | 2372 | ||
| 2452 | process_scheduled_works(rescuer); | 2373 | process_scheduled_works(rescuer); |
| 2453 | 2374 | ||
| 2454 | /* | 2375 | /* |
| 2455 | * Leave this gcwq. If keep_working() is %true, notify a | 2376 | * Leave this pool. If keep_working() is %true, notify a |
| 2456 | * regular worker; otherwise, we end up with 0 concurrency | 2377 | * regular worker; otherwise, we end up with 0 concurrency |
| 2457 | * and stalling the execution. | 2378 | * and stalling the execution. |
| 2458 | */ | 2379 | */ |
| 2459 | if (keep_working(pool)) | 2380 | if (keep_working(pool)) |
| 2460 | wake_up_worker(pool); | 2381 | wake_up_worker(pool); |
| 2461 | 2382 | ||
| 2462 | spin_unlock_irq(&gcwq->lock); | 2383 | spin_unlock_irq(&pool->lock); |
| 2463 | } | 2384 | } |
| 2464 | 2385 | ||
| 2386 | /* rescuers should never participate in concurrency management */ | ||
| 2387 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); | ||
| 2465 | schedule(); | 2388 | schedule(); |
| 2466 | goto repeat; | 2389 | goto repeat; |
| 2467 | } | 2390 | } |
| @@ -2479,7 +2402,7 @@ static void wq_barrier_func(struct work_struct *work) | |||
| 2479 | 2402 | ||
| 2480 | /** | 2403 | /** |
| 2481 | * insert_wq_barrier - insert a barrier work | 2404 | * insert_wq_barrier - insert a barrier work |
| 2482 | * @cwq: cwq to insert barrier into | 2405 | * @pwq: pwq to insert barrier into |
| 2483 | * @barr: wq_barrier to insert | 2406 | * @barr: wq_barrier to insert |
| 2484 | * @target: target work to attach @barr to | 2407 | * @target: target work to attach @barr to |
| 2485 | * @worker: worker currently executing @target, NULL if @target is not executing | 2408 | * @worker: worker currently executing @target, NULL if @target is not executing |
| @@ -2496,12 +2419,12 @@ static void wq_barrier_func(struct work_struct *work) | |||
| 2496 | * after a work with LINKED flag set. | 2419 | * after a work with LINKED flag set. |
| 2497 | * | 2420 | * |
| 2498 | * Note that when @worker is non-NULL, @target may be modified | 2421 | * Note that when @worker is non-NULL, @target may be modified |
| 2499 | * underneath us, so we can't reliably determine cwq from @target. | 2422 | * underneath us, so we can't reliably determine pwq from @target. |
| 2500 | * | 2423 | * |
| 2501 | * CONTEXT: | 2424 | * CONTEXT: |
| 2502 | * spin_lock_irq(gcwq->lock). | 2425 | * spin_lock_irq(pool->lock). |
| 2503 | */ | 2426 | */ |
| 2504 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | 2427 | static void insert_wq_barrier(struct pool_workqueue *pwq, |
| 2505 | struct wq_barrier *barr, | 2428 | struct wq_barrier *barr, |
| 2506 | struct work_struct *target, struct worker *worker) | 2429 | struct work_struct *target, struct worker *worker) |
| 2507 | { | 2430 | { |
| @@ -2509,7 +2432,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
| 2509 | unsigned int linked = 0; | 2432 | unsigned int linked = 0; |
| 2510 | 2433 | ||
| 2511 | /* | 2434 | /* |
| 2512 | * debugobject calls are safe here even with gcwq->lock locked | 2435 | * debugobject calls are safe here even with pool->lock locked |
| 2513 | * as we know for sure that this will not trigger any of the | 2436 | * as we know for sure that this will not trigger any of the |
| 2514 | * checks and call back into the fixup functions where we | 2437 | * checks and call back into the fixup functions where we |
| 2515 | * might deadlock. | 2438 | * might deadlock. |
| @@ -2534,23 +2457,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
| 2534 | } | 2457 | } |
| 2535 | 2458 | ||
| 2536 | debug_work_activate(&barr->work); | 2459 | debug_work_activate(&barr->work); |
| 2537 | insert_work(cwq, &barr->work, head, | 2460 | insert_work(pwq, &barr->work, head, |
| 2538 | work_color_to_flags(WORK_NO_COLOR) | linked); | 2461 | work_color_to_flags(WORK_NO_COLOR) | linked); |
| 2539 | } | 2462 | } |
| 2540 | 2463 | ||
| 2541 | /** | 2464 | /** |
| 2542 | * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing | 2465 | * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing |
| 2543 | * @wq: workqueue being flushed | 2466 | * @wq: workqueue being flushed |
| 2544 | * @flush_color: new flush color, < 0 for no-op | 2467 | * @flush_color: new flush color, < 0 for no-op |
| 2545 | * @work_color: new work color, < 0 for no-op | 2468 | * @work_color: new work color, < 0 for no-op |
| 2546 | * | 2469 | * |
| 2547 | * Prepare cwqs for workqueue flushing. | 2470 | * Prepare pwqs for workqueue flushing. |
| 2548 | * | 2471 | * |
| 2549 | * If @flush_color is non-negative, flush_color on all cwqs should be | 2472 | * If @flush_color is non-negative, flush_color on all pwqs should be |
| 2550 | * -1. If no cwq has in-flight commands at the specified color, all | 2473 | * -1. If no pwq has in-flight commands at the specified color, all |
| 2551 | * cwq->flush_color's stay at -1 and %false is returned. If any cwq | 2474 | * pwq->flush_color's stay at -1 and %false is returned. If any pwq |
| 2552 | * has in flight commands, its cwq->flush_color is set to | 2475 | * has in flight commands, its pwq->flush_color is set to |
| 2553 | * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq | 2476 | * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq |
| 2554 | * wakeup logic is armed and %true is returned. | 2477 | * wakeup logic is armed and %true is returned. |
| 2555 | * | 2478 | * |
| 2556 | * The caller should have initialized @wq->first_flusher prior to | 2479 | * The caller should have initialized @wq->first_flusher prior to |
| @@ -2558,7 +2481,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
| 2558 | * @flush_color is negative, no flush color update is done and %false | 2481 | * @flush_color is negative, no flush color update is done and %false |
| 2559 | * is returned. | 2482 | * is returned. |
| 2560 | * | 2483 | * |
| 2561 | * If @work_color is non-negative, all cwqs should have the same | 2484 | * If @work_color is non-negative, all pwqs should have the same |
| 2562 | * work_color which is previous to @work_color and all will be | 2485 | * work_color which is previous to @work_color and all will be |
| 2563 | * advanced to @work_color. | 2486 | * advanced to @work_color. |
| 2564 | * | 2487 | * |
| @@ -2569,42 +2492,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
| 2569 | * %true if @flush_color >= 0 and there's something to flush. %false | 2492 | * %true if @flush_color >= 0 and there's something to flush. %false |
| 2570 | * otherwise. | 2493 | * otherwise. |
| 2571 | */ | 2494 | */ |
| 2572 | static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | 2495 | static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, |
| 2573 | int flush_color, int work_color) | 2496 | int flush_color, int work_color) |
| 2574 | { | 2497 | { |
| 2575 | bool wait = false; | 2498 | bool wait = false; |
| 2576 | unsigned int cpu; | 2499 | unsigned int cpu; |
| 2577 | 2500 | ||
| 2578 | if (flush_color >= 0) { | 2501 | if (flush_color >= 0) { |
| 2579 | BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); | 2502 | BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); |
| 2580 | atomic_set(&wq->nr_cwqs_to_flush, 1); | 2503 | atomic_set(&wq->nr_pwqs_to_flush, 1); |
| 2581 | } | 2504 | } |
| 2582 | 2505 | ||
| 2583 | for_each_cwq_cpu(cpu, wq) { | 2506 | for_each_pwq_cpu(cpu, wq) { |
| 2584 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2507 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 2585 | struct global_cwq *gcwq = cwq->pool->gcwq; | 2508 | struct worker_pool *pool = pwq->pool; |
| 2586 | 2509 | ||
| 2587 | spin_lock_irq(&gcwq->lock); | 2510 | spin_lock_irq(&pool->lock); |
| 2588 | 2511 | ||
| 2589 | if (flush_color >= 0) { | 2512 | if (flush_color >= 0) { |
| 2590 | BUG_ON(cwq->flush_color != -1); | 2513 | BUG_ON(pwq->flush_color != -1); |
| 2591 | 2514 | ||
| 2592 | if (cwq->nr_in_flight[flush_color]) { | 2515 | if (pwq->nr_in_flight[flush_color]) { |
| 2593 | cwq->flush_color = flush_color; | 2516 | pwq->flush_color = flush_color; |
| 2594 | atomic_inc(&wq->nr_cwqs_to_flush); | 2517 | atomic_inc(&wq->nr_pwqs_to_flush); |
| 2595 | wait = true; | 2518 | wait = true; |
| 2596 | } | 2519 | } |
| 2597 | } | 2520 | } |
| 2598 | 2521 | ||
| 2599 | if (work_color >= 0) { | 2522 | if (work_color >= 0) { |
| 2600 | BUG_ON(work_color != work_next_color(cwq->work_color)); | 2523 | BUG_ON(work_color != work_next_color(pwq->work_color)); |
| 2601 | cwq->work_color = work_color; | 2524 | pwq->work_color = work_color; |
| 2602 | } | 2525 | } |
| 2603 | 2526 | ||
| 2604 | spin_unlock_irq(&gcwq->lock); | 2527 | spin_unlock_irq(&pool->lock); |
| 2605 | } | 2528 | } |
| 2606 | 2529 | ||
| 2607 | if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) | 2530 | if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) |
| 2608 | complete(&wq->first_flusher->done); | 2531 | complete(&wq->first_flusher->done); |
| 2609 | 2532 | ||
| 2610 | return wait; | 2533 | return wait; |
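flush_workqueue_prep_pwqs() relies on the familiar primed-counter idiom: nr_pwqs_to_flush starts at 1, each pwq that still has works of the flush color adds a reference, and whoever performs the final decrement (the priming drop here, or the last retiring work elsewhere) completes the first flusher exactly once. A compact C11 model of that idiom with invented names and a plain flag standing in for the kernel completion:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int nr_to_flush;
static bool flush_done;

static void complete_flush(void)
{
	flush_done = true;        /* stands in for complete(&first_flusher->done) */
}

/* one caller "arms" the counter and holds the priming reference */
static bool prep_flush(int in_flight_pwqs)
{
	atomic_store(&nr_to_flush, 1);                  /* priming reference */

	for (int i = 0; i < in_flight_pwqs; i++)
		atomic_fetch_add(&nr_to_flush, 1);      /* one ref per busy pwq */

	/* drop the priming reference; completes only if nothing was busy */
	if (atomic_fetch_sub(&nr_to_flush, 1) == 1)
		complete_flush();

	return in_flight_pwqs > 0;                      /* the "wait" result */
}

/* called as each busy pwq finishes its flush color */
static void pwq_flush_done(void)
{
	if (atomic_fetch_sub(&nr_to_flush, 1) == 1)
		complete_flush();
}

int main(void)
{
	bool wait = prep_flush(2);

	pwq_flush_done();
	pwq_flush_done();
	printf("wait=%d done=%d\n", wait, flush_done);
	return 0;
}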
| @@ -2655,7 +2578,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 2655 | 2578 | ||
| 2656 | wq->first_flusher = &this_flusher; | 2579 | wq->first_flusher = &this_flusher; |
| 2657 | 2580 | ||
| 2658 | if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, | 2581 | if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, |
| 2659 | wq->work_color)) { | 2582 | wq->work_color)) { |
| 2660 | /* nothing to flush, done */ | 2583 | /* nothing to flush, done */ |
| 2661 | wq->flush_color = next_color; | 2584 | wq->flush_color = next_color; |
| @@ -2666,7 +2589,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 2666 | /* wait in queue */ | 2589 | /* wait in queue */ |
| 2667 | BUG_ON(wq->flush_color == this_flusher.flush_color); | 2590 | BUG_ON(wq->flush_color == this_flusher.flush_color); |
| 2668 | list_add_tail(&this_flusher.list, &wq->flusher_queue); | 2591 | list_add_tail(&this_flusher.list, &wq->flusher_queue); |
| 2669 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | 2592 | flush_workqueue_prep_pwqs(wq, -1, wq->work_color); |
| 2670 | } | 2593 | } |
| 2671 | } else { | 2594 | } else { |
| 2672 | /* | 2595 | /* |
| @@ -2733,7 +2656,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 2733 | 2656 | ||
| 2734 | list_splice_tail_init(&wq->flusher_overflow, | 2657 | list_splice_tail_init(&wq->flusher_overflow, |
| 2735 | &wq->flusher_queue); | 2658 | &wq->flusher_queue); |
| 2736 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | 2659 | flush_workqueue_prep_pwqs(wq, -1, wq->work_color); |
| 2737 | } | 2660 | } |
| 2738 | 2661 | ||
| 2739 | if (list_empty(&wq->flusher_queue)) { | 2662 | if (list_empty(&wq->flusher_queue)) { |
| @@ -2743,7 +2666,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 2743 | 2666 | ||
| 2744 | /* | 2667 | /* |
| 2745 | * Need to flush more colors. Make the next flusher | 2668 | * Need to flush more colors. Make the next flusher |
| 2746 | * the new first flusher and arm cwqs. | 2669 | * the new first flusher and arm pwqs. |
| 2747 | */ | 2670 | */ |
| 2748 | BUG_ON(wq->flush_color == wq->work_color); | 2671 | BUG_ON(wq->flush_color == wq->work_color); |
| 2749 | BUG_ON(wq->flush_color != next->flush_color); | 2672 | BUG_ON(wq->flush_color != next->flush_color); |
| @@ -2751,7 +2674,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 2751 | list_del_init(&next->list); | 2674 | list_del_init(&next->list); |
| 2752 | wq->first_flusher = next; | 2675 | wq->first_flusher = next; |
| 2753 | 2676 | ||
| 2754 | if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) | 2677 | if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) |
| 2755 | break; | 2678 | break; |
| 2756 | 2679 | ||
| 2757 | /* | 2680 | /* |
| @@ -2794,13 +2717,13 @@ void drain_workqueue(struct workqueue_struct *wq) | |||
| 2794 | reflush: | 2717 | reflush: |
| 2795 | flush_workqueue(wq); | 2718 | flush_workqueue(wq); |
| 2796 | 2719 | ||
| 2797 | for_each_cwq_cpu(cpu, wq) { | 2720 | for_each_pwq_cpu(cpu, wq) { |
| 2798 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2721 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 2799 | bool drained; | 2722 | bool drained; |
| 2800 | 2723 | ||
| 2801 | spin_lock_irq(&cwq->pool->gcwq->lock); | 2724 | spin_lock_irq(&pwq->pool->lock); |
| 2802 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); | 2725 | drained = !pwq->nr_active && list_empty(&pwq->delayed_works); |
| 2803 | spin_unlock_irq(&cwq->pool->gcwq->lock); | 2726 | spin_unlock_irq(&pwq->pool->lock); |
| 2804 | 2727 | ||
| 2805 | if (drained) | 2728 | if (drained) |
| 2806 | continue; | 2729 | continue; |
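The drain loop above has to re-flush because a draining workqueue may still contain self-requeueing works: it keeps flushing until every pwq reports zero nr_active and an empty delayed list. A toy model of that retry loop, with a pretend flush that retires one work per pwq per pass (names and behaviour invented purely for illustration):

#include <stdbool.h>
#include <stdio.h>

struct toy_pwq { int nr_active; int nr_delayed; };

/* pretend flush: each pass retires one active work per pwq */
static void toy_flush(struct toy_pwq *pwqs, int n)
{
	for (int i = 0; i < n; i++) {
		if (pwqs[i].nr_active)
			pwqs[i].nr_active--;
		if (pwqs[i].nr_delayed) {       /* a delayed work becomes active */
			pwqs[i].nr_delayed--;
			pwqs[i].nr_active++;
		}
	}
}

static void toy_drain(struct toy_pwq *pwqs, int n)
{
	bool drained;

	do {
		toy_flush(pwqs, n);
		drained = true;
		for (int i = 0; i < n; i++)     /* the per-pwq check above */
			if (pwqs[i].nr_active || pwqs[i].nr_delayed)
				drained = false;
	} while (!drained);
}

int main(void)
{
	struct toy_pwq pwqs[2] = { { 1, 2 }, { 0, 1 } };

	toy_drain(pwqs, 2);
	printf("drained\n");
	return 0;
}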
| @@ -2822,34 +2745,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue); | |||
| 2822 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) | 2745 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) |
| 2823 | { | 2746 | { |
| 2824 | struct worker *worker = NULL; | 2747 | struct worker *worker = NULL; |
| 2825 | struct global_cwq *gcwq; | 2748 | struct worker_pool *pool; |
| 2826 | struct cpu_workqueue_struct *cwq; | 2749 | struct pool_workqueue *pwq; |
| 2827 | 2750 | ||
| 2828 | might_sleep(); | 2751 | might_sleep(); |
| 2829 | gcwq = get_work_gcwq(work); | 2752 | pool = get_work_pool(work); |
| 2830 | if (!gcwq) | 2753 | if (!pool) |
| 2831 | return false; | 2754 | return false; |
| 2832 | 2755 | ||
| 2833 | spin_lock_irq(&gcwq->lock); | 2756 | spin_lock_irq(&pool->lock); |
| 2834 | if (!list_empty(&work->entry)) { | 2757 | /* see the comment in try_to_grab_pending() with the same code */ |
| 2835 | /* | 2758 | pwq = get_work_pwq(work); |
| 2836 | * See the comment near try_to_grab_pending()->smp_rmb(). | 2759 | if (pwq) { |
| 2837 | * If it was re-queued to a different gcwq under us, we | 2760 | if (unlikely(pwq->pool != pool)) |
| 2838 | * are not going to wait. | ||
| 2839 | */ | ||
| 2840 | smp_rmb(); | ||
| 2841 | cwq = get_work_cwq(work); | ||
| 2842 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) | ||
| 2843 | goto already_gone; | 2761 | goto already_gone; |
| 2844 | } else { | 2762 | } else { |
| 2845 | worker = find_worker_executing_work(gcwq, work); | 2763 | worker = find_worker_executing_work(pool, work); |
| 2846 | if (!worker) | 2764 | if (!worker) |
| 2847 | goto already_gone; | 2765 | goto already_gone; |
| 2848 | cwq = worker->current_cwq; | 2766 | pwq = worker->current_pwq; |
| 2849 | } | 2767 | } |
| 2850 | 2768 | ||
| 2851 | insert_wq_barrier(cwq, barr, work, worker); | 2769 | insert_wq_barrier(pwq, barr, work, worker); |
| 2852 | spin_unlock_irq(&gcwq->lock); | 2770 | spin_unlock_irq(&pool->lock); |
| 2853 | 2771 | ||
| 2854 | /* | 2772 | /* |
| 2855 | * If @max_active is 1 or rescuer is in use, flushing another work | 2773 | * If @max_active is 1 or rescuer is in use, flushing another work |
| @@ -2857,15 +2775,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) | |||
| 2857 | * flusher is not running on the same workqueue by verifying write | 2775 | * flusher is not running on the same workqueue by verifying write |
| 2858 | * access. | 2776 | * access. |
| 2859 | */ | 2777 | */ |
| 2860 | if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) | 2778 | if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) |
| 2861 | lock_map_acquire(&cwq->wq->lockdep_map); | 2779 | lock_map_acquire(&pwq->wq->lockdep_map); |
| 2862 | else | 2780 | else |
| 2863 | lock_map_acquire_read(&cwq->wq->lockdep_map); | 2781 | lock_map_acquire_read(&pwq->wq->lockdep_map); |
| 2864 | lock_map_release(&cwq->wq->lockdep_map); | 2782 | lock_map_release(&pwq->wq->lockdep_map); |
| 2865 | 2783 | ||
| 2866 | return true; | 2784 | return true; |
| 2867 | already_gone: | 2785 | already_gone: |
| 2868 | spin_unlock_irq(&gcwq->lock); | 2786 | spin_unlock_irq(&pool->lock); |
| 2869 | return false; | 2787 | return false; |
| 2870 | } | 2788 | } |
| 2871 | 2789 | ||
| @@ -2961,8 +2879,7 @@ bool flush_delayed_work(struct delayed_work *dwork) | |||
| 2961 | { | 2879 | { |
| 2962 | local_irq_disable(); | 2880 | local_irq_disable(); |
| 2963 | if (del_timer_sync(&dwork->timer)) | 2881 | if (del_timer_sync(&dwork->timer)) |
| 2964 | __queue_work(dwork->cpu, | 2882 | __queue_work(dwork->cpu, dwork->wq, &dwork->work); |
| 2965 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
| 2966 | local_irq_enable(); | 2883 | local_irq_enable(); |
| 2967 | return flush_work(&dwork->work); | 2884 | return flush_work(&dwork->work); |
| 2968 | } | 2885 | } |
| @@ -2992,7 +2909,8 @@ bool cancel_delayed_work(struct delayed_work *dwork) | |||
| 2992 | if (unlikely(ret < 0)) | 2909 | if (unlikely(ret < 0)) |
| 2993 | return false; | 2910 | return false; |
| 2994 | 2911 | ||
| 2995 | set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); | 2912 | set_work_pool_and_clear_pending(&dwork->work, |
| 2913 | get_work_pool_id(&dwork->work)); | ||
| 2996 | local_irq_restore(flags); | 2914 | local_irq_restore(flags); |
| 2997 | return ret; | 2915 | return ret; |
| 2998 | } | 2916 | } |
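cancel_delayed_work() can record the pool directly because work->data multiplexes two encodings behind the low flag bits: a pwq pointer while the work is queued (hence the alignment games in alloc_pwqs()) and, once PENDING is cleared, the last pool ID shifted above the flags. A stripped-down model of that encoding; the bit positions, widths and helper names here are made up and only mimic the general scheme:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PENDING_BIT  0
#define TOY_PWQ_BIT      1              /* "data currently holds a pwq pointer" */
#define TOY_FLAG_BITS    4

struct toy_work { unsigned long data; };

static void toy_set_work_pwq(struct toy_work *w, void *pwq)
{
	/* the pointer must leave the low flag bits free, hence aligned pwqs */
	assert(((uintptr_t)pwq & ((1ul << TOY_FLAG_BITS) - 1)) == 0);
	w->data = (uintptr_t)pwq | (1ul << TOY_PENDING_BIT) | (1ul << TOY_PWQ_BIT);
}

static void toy_set_work_pool_and_clear_pending(struct toy_work *w,
						unsigned long pool_id)
{
	/* drop the pwq pointer, PENDING and all other flags in one store */
	w->data = pool_id << TOY_FLAG_BITS;
}

static unsigned long toy_get_work_pool_id(const struct toy_work *w)
{
	/* only meaningful once the pwq encoding has been replaced */
	assert(!(w->data & (1ul << TOY_PWQ_BIT)));
	return w->data >> TOY_FLAG_BITS;
}

int main(void)
{
	_Alignas(1 << TOY_FLAG_BITS) static char fake_pwq[32];
	struct toy_work w = { 0 };

	toy_set_work_pwq(&w, fake_pwq);
	toy_set_work_pool_and_clear_pending(&w, 3);
	printf("last pool id: %lu, pending: %lu\n",
	       toy_get_work_pool_id(&w), w.data & (1ul << TOY_PENDING_BIT));
	return 0;
}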
| @@ -3171,46 +3089,46 @@ int keventd_up(void) | |||
| 3171 | return system_wq != NULL; | 3089 | return system_wq != NULL; |
| 3172 | } | 3090 | } |
| 3173 | 3091 | ||
| 3174 | static int alloc_cwqs(struct workqueue_struct *wq) | 3092 | static int alloc_pwqs(struct workqueue_struct *wq) |
| 3175 | { | 3093 | { |
| 3176 | /* | 3094 | /* |
| 3177 | * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. | 3095 | * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. |
| 3178 | * Make sure that the alignment isn't lower than that of | 3096 | * Make sure that the alignment isn't lower than that of |
| 3179 | * unsigned long long. | 3097 | * unsigned long long. |
| 3180 | */ | 3098 | */ |
| 3181 | const size_t size = sizeof(struct cpu_workqueue_struct); | 3099 | const size_t size = sizeof(struct pool_workqueue); |
| 3182 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | 3100 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, |
| 3183 | __alignof__(unsigned long long)); | 3101 | __alignof__(unsigned long long)); |
| 3184 | 3102 | ||
| 3185 | if (!(wq->flags & WQ_UNBOUND)) | 3103 | if (!(wq->flags & WQ_UNBOUND)) |
| 3186 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); | 3104 | wq->pool_wq.pcpu = __alloc_percpu(size, align); |
| 3187 | else { | 3105 | else { |
| 3188 | void *ptr; | 3106 | void *ptr; |
| 3189 | 3107 | ||
| 3190 | /* | 3108 | /* |
| 3191 | * Allocate enough room to align cwq and put an extra | 3109 | * Allocate enough room to align pwq and put an extra |
| 3192 | * pointer at the end pointing back to the originally | 3110 | * pointer at the end pointing back to the originally |
| 3193 | * allocated pointer which will be used for free. | 3111 | * allocated pointer which will be used for free. |
| 3194 | */ | 3112 | */ |
| 3195 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); | 3113 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); |
| 3196 | if (ptr) { | 3114 | if (ptr) { |
| 3197 | wq->cpu_wq.single = PTR_ALIGN(ptr, align); | 3115 | wq->pool_wq.single = PTR_ALIGN(ptr, align); |
| 3198 | *(void **)(wq->cpu_wq.single + 1) = ptr; | 3116 | *(void **)(wq->pool_wq.single + 1) = ptr; |
| 3199 | } | 3117 | } |
| 3200 | } | 3118 | } |
| 3201 | 3119 | ||
| 3202 | /* just in case, make sure it's actually aligned */ | 3120 | /* just in case, make sure it's actually aligned */ |
| 3203 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | 3121 | BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); |
| 3204 | return wq->cpu_wq.v ? 0 : -ENOMEM; | 3122 | return wq->pool_wq.v ? 0 : -ENOMEM; |
| 3205 | } | 3123 | } |
| 3206 | 3124 | ||
| 3207 | static void free_cwqs(struct workqueue_struct *wq) | 3125 | static void free_pwqs(struct workqueue_struct *wq) |
| 3208 | { | 3126 | { |
| 3209 | if (!(wq->flags & WQ_UNBOUND)) | 3127 | if (!(wq->flags & WQ_UNBOUND)) |
| 3210 | free_percpu(wq->cpu_wq.pcpu); | 3128 | free_percpu(wq->pool_wq.pcpu); |
| 3211 | else if (wq->cpu_wq.single) { | 3129 | else if (wq->pool_wq.single) { |
| 3212 | /* the pointer to free is stored right after the cwq */ | 3130 | /* the pointer to free is stored right after the pwq */ |
| 3213 | kfree(*(void **)(wq->cpu_wq.single + 1)); | 3131 | kfree(*(void **)(wq->pool_wq.single + 1)); |
| 3214 | } | 3132 | } |
| 3215 | } | 3133 | } |
| 3216 | 3134 | ||
| @@ -3264,27 +3182,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 3264 | wq->flags = flags; | 3182 | wq->flags = flags; |
| 3265 | wq->saved_max_active = max_active; | 3183 | wq->saved_max_active = max_active; |
| 3266 | mutex_init(&wq->flush_mutex); | 3184 | mutex_init(&wq->flush_mutex); |
| 3267 | atomic_set(&wq->nr_cwqs_to_flush, 0); | 3185 | atomic_set(&wq->nr_pwqs_to_flush, 0); |
| 3268 | INIT_LIST_HEAD(&wq->flusher_queue); | 3186 | INIT_LIST_HEAD(&wq->flusher_queue); |
| 3269 | INIT_LIST_HEAD(&wq->flusher_overflow); | 3187 | INIT_LIST_HEAD(&wq->flusher_overflow); |
| 3270 | 3188 | ||
| 3271 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 3189 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
| 3272 | INIT_LIST_HEAD(&wq->list); | 3190 | INIT_LIST_HEAD(&wq->list); |
| 3273 | 3191 | ||
| 3274 | if (alloc_cwqs(wq) < 0) | 3192 | if (alloc_pwqs(wq) < 0) |
| 3275 | goto err; | 3193 | goto err; |
| 3276 | 3194 | ||
| 3277 | for_each_cwq_cpu(cpu, wq) { | 3195 | for_each_pwq_cpu(cpu, wq) { |
| 3278 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3196 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3279 | struct global_cwq *gcwq = get_gcwq(cpu); | 3197 | |
| 3280 | int pool_idx = (bool)(flags & WQ_HIGHPRI); | 3198 | BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); |
| 3281 | 3199 | pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); | |
| 3282 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); | 3200 | pwq->wq = wq; |
| 3283 | cwq->pool = &gcwq->pools[pool_idx]; | 3201 | pwq->flush_color = -1; |
| 3284 | cwq->wq = wq; | 3202 | pwq->max_active = max_active; |
| 3285 | cwq->flush_color = -1; | 3203 | INIT_LIST_HEAD(&pwq->delayed_works); |
| 3286 | cwq->max_active = max_active; | ||
| 3287 | INIT_LIST_HEAD(&cwq->delayed_works); | ||
| 3288 | } | 3204 | } |
| 3289 | 3205 | ||
| 3290 | if (flags & WQ_RESCUER) { | 3206 | if (flags & WQ_RESCUER) { |
| @@ -3297,7 +3213,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 3297 | if (!rescuer) | 3213 | if (!rescuer) |
| 3298 | goto err; | 3214 | goto err; |
| 3299 | 3215 | ||
| 3300 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", | 3216 | rescuer->rescue_wq = wq; |
| 3217 | rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", | ||
| 3301 | wq->name); | 3218 | wq->name); |
| 3302 | if (IS_ERR(rescuer->task)) | 3219 | if (IS_ERR(rescuer->task)) |
| 3303 | goto err; | 3220 | goto err; |
| @@ -3314,8 +3231,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 3314 | spin_lock(&workqueue_lock); | 3231 | spin_lock(&workqueue_lock); |
| 3315 | 3232 | ||
| 3316 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) | 3233 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) |
| 3317 | for_each_cwq_cpu(cpu, wq) | 3234 | for_each_pwq_cpu(cpu, wq) |
| 3318 | get_cwq(cpu, wq)->max_active = 0; | 3235 | get_pwq(cpu, wq)->max_active = 0; |
| 3319 | 3236 | ||
| 3320 | list_add(&wq->list, &workqueues); | 3237 | list_add(&wq->list, &workqueues); |
| 3321 | 3238 | ||
| @@ -3324,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 3324 | return wq; | 3241 | return wq; |
| 3325 | err: | 3242 | err: |
| 3326 | if (wq) { | 3243 | if (wq) { |
| 3327 | free_cwqs(wq); | 3244 | free_pwqs(wq); |
| 3328 | free_mayday_mask(wq->mayday_mask); | 3245 | free_mayday_mask(wq->mayday_mask); |
| 3329 | kfree(wq->rescuer); | 3246 | kfree(wq->rescuer); |
| 3330 | kfree(wq); | 3247 | kfree(wq); |
| @@ -3355,14 +3272,14 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 3355 | spin_unlock(&workqueue_lock); | 3272 | spin_unlock(&workqueue_lock); |
| 3356 | 3273 | ||
| 3357 | /* sanity check */ | 3274 | /* sanity check */ |
| 3358 | for_each_cwq_cpu(cpu, wq) { | 3275 | for_each_pwq_cpu(cpu, wq) { |
| 3359 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3276 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3360 | int i; | 3277 | int i; |
| 3361 | 3278 | ||
| 3362 | for (i = 0; i < WORK_NR_COLORS; i++) | 3279 | for (i = 0; i < WORK_NR_COLORS; i++) |
| 3363 | BUG_ON(cwq->nr_in_flight[i]); | 3280 | BUG_ON(pwq->nr_in_flight[i]); |
| 3364 | BUG_ON(cwq->nr_active); | 3281 | BUG_ON(pwq->nr_active); |
| 3365 | BUG_ON(!list_empty(&cwq->delayed_works)); | 3282 | BUG_ON(!list_empty(&pwq->delayed_works)); |
| 3366 | } | 3283 | } |
| 3367 | 3284 | ||
| 3368 | if (wq->flags & WQ_RESCUER) { | 3285 | if (wq->flags & WQ_RESCUER) { |
| @@ -3371,29 +3288,29 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 3371 | kfree(wq->rescuer); | 3288 | kfree(wq->rescuer); |
| 3372 | } | 3289 | } |
| 3373 | 3290 | ||
| 3374 | free_cwqs(wq); | 3291 | free_pwqs(wq); |
| 3375 | kfree(wq); | 3292 | kfree(wq); |
| 3376 | } | 3293 | } |
| 3377 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 3294 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
| 3378 | 3295 | ||
| 3379 | /** | 3296 | /** |
| 3380 | * cwq_set_max_active - adjust max_active of a cwq | 3297 | * pwq_set_max_active - adjust max_active of a pwq |
| 3381 | * @cwq: target cpu_workqueue_struct | 3298 | * @pwq: target pool_workqueue |
| 3382 | * @max_active: new max_active value. | 3299 | * @max_active: new max_active value. |
| 3383 | * | 3300 | * |
| 3384 | * Set @cwq->max_active to @max_active and activate delayed works if | 3301 | * Set @pwq->max_active to @max_active and activate delayed works if |
| 3385 | * increased. | 3302 | * increased. |
| 3386 | * | 3303 | * |
| 3387 | * CONTEXT: | 3304 | * CONTEXT: |
| 3388 | * spin_lock_irq(gcwq->lock). | 3305 | * spin_lock_irq(pool->lock). |
| 3389 | */ | 3306 | */ |
| 3390 | static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) | 3307 | static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) |
| 3391 | { | 3308 | { |
| 3392 | cwq->max_active = max_active; | 3309 | pwq->max_active = max_active; |
| 3393 | 3310 | ||
| 3394 | while (!list_empty(&cwq->delayed_works) && | 3311 | while (!list_empty(&pwq->delayed_works) && |
| 3395 | cwq->nr_active < cwq->max_active) | 3312 | pwq->nr_active < pwq->max_active) |
| 3396 | cwq_activate_first_delayed(cwq); | 3313 | pwq_activate_first_delayed(pwq); |
| 3397 | } | 3314 | } |
| 3398 | 3315 | ||
| 3399 | /** | 3316 | /** |
| @@ -3416,16 +3333,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
| 3416 | 3333 | ||
| 3417 | wq->saved_max_active = max_active; | 3334 | wq->saved_max_active = max_active; |
| 3418 | 3335 | ||
| 3419 | for_each_cwq_cpu(cpu, wq) { | 3336 | for_each_pwq_cpu(cpu, wq) { |
| 3420 | struct global_cwq *gcwq = get_gcwq(cpu); | 3337 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3338 | struct worker_pool *pool = pwq->pool; | ||
| 3421 | 3339 | ||
| 3422 | spin_lock_irq(&gcwq->lock); | 3340 | spin_lock_irq(&pool->lock); |
| 3423 | 3341 | ||
| 3424 | if (!(wq->flags & WQ_FREEZABLE) || | 3342 | if (!(wq->flags & WQ_FREEZABLE) || |
| 3425 | !(gcwq->flags & GCWQ_FREEZING)) | 3343 | !(pool->flags & POOL_FREEZING)) |
| 3426 | cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); | 3344 | pwq_set_max_active(pwq, max_active); |
| 3427 | 3345 | ||
| 3428 | spin_unlock_irq(&gcwq->lock); | 3346 | spin_unlock_irq(&pool->lock); |
| 3429 | } | 3347 | } |
| 3430 | 3348 | ||
| 3431 | spin_unlock(&workqueue_lock); | 3349 | spin_unlock(&workqueue_lock); |
| @@ -3446,57 +3364,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); | |||
| 3446 | */ | 3364 | */ |
| 3447 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) | 3365 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) |
| 3448 | { | 3366 | { |
| 3449 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3367 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3450 | 3368 | ||
| 3451 | return !list_empty(&cwq->delayed_works); | 3369 | return !list_empty(&pwq->delayed_works); |
| 3452 | } | 3370 | } |
| 3453 | EXPORT_SYMBOL_GPL(workqueue_congested); | 3371 | EXPORT_SYMBOL_GPL(workqueue_congested); |
| 3454 | 3372 | ||
| 3455 | /** | 3373 | /** |
| 3456 | * work_cpu - return the last known associated cpu for @work | ||
| 3457 | * @work: the work of interest | ||
| 3458 | * | ||
| 3459 | * RETURNS: | ||
| 3460 | * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. | ||
| 3461 | */ | ||
| 3462 | unsigned int work_cpu(struct work_struct *work) | ||
| 3463 | { | ||
| 3464 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 3465 | |||
| 3466 | return gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
| 3467 | } | ||
| 3468 | EXPORT_SYMBOL_GPL(work_cpu); | ||
| 3469 | |||
| 3470 | /** | ||
| 3471 | * work_busy - test whether a work is currently pending or running | 3374 | * work_busy - test whether a work is currently pending or running |
| 3472 | * @work: the work to be tested | 3375 | * @work: the work to be tested |
| 3473 | * | 3376 | * |
| 3474 | * Test whether @work is currently pending or running. There is no | 3377 | * Test whether @work is currently pending or running. There is no |
| 3475 | * synchronization around this function and the test result is | 3378 | * synchronization around this function and the test result is |
| 3476 | * unreliable and only useful as advisory hints or for debugging. | 3379 | * unreliable and only useful as advisory hints or for debugging. |
| 3477 | * Especially for reentrant wqs, the pending state might hide the | ||
| 3478 | * running state. | ||
| 3479 | * | 3380 | * |
| 3480 | * RETURNS: | 3381 | * RETURNS: |
| 3481 | * OR'd bitmask of WORK_BUSY_* bits. | 3382 | * OR'd bitmask of WORK_BUSY_* bits. |
| 3482 | */ | 3383 | */ |
| 3483 | unsigned int work_busy(struct work_struct *work) | 3384 | unsigned int work_busy(struct work_struct *work) |
| 3484 | { | 3385 | { |
| 3485 | struct global_cwq *gcwq = get_work_gcwq(work); | 3386 | struct worker_pool *pool = get_work_pool(work); |
| 3486 | unsigned long flags; | 3387 | unsigned long flags; |
| 3487 | unsigned int ret = 0; | 3388 | unsigned int ret = 0; |
| 3488 | 3389 | ||
| 3489 | if (!gcwq) | ||
| 3490 | return 0; | ||
| 3491 | |||
| 3492 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 3493 | |||
| 3494 | if (work_pending(work)) | 3390 | if (work_pending(work)) |
| 3495 | ret |= WORK_BUSY_PENDING; | 3391 | ret |= WORK_BUSY_PENDING; |
| 3496 | if (find_worker_executing_work(gcwq, work)) | ||
| 3497 | ret |= WORK_BUSY_RUNNING; | ||
| 3498 | 3392 | ||
| 3499 | spin_unlock_irqrestore(&gcwq->lock, flags); | 3393 | if (pool) { |
| 3394 | spin_lock_irqsave(&pool->lock, flags); | ||
| 3395 | if (find_worker_executing_work(pool, work)) | ||
| 3396 | ret |= WORK_BUSY_RUNNING; | ||
| 3397 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 3398 | } | ||
| 3500 | 3399 | ||
| 3501 | return ret; | 3400 | return ret; |
| 3502 | } | 3401 | } |
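With get_work_pool() in place, work_busy() splits cleanly: the PENDING half is read straight off the work item with no lock, and only the RUNNING half still needs the pool's lock and busy-hash lookup, so a never-queued work no longer short-circuits the whole function. A hedged sketch of that shape, with placeholder predicates instead of the real work_pending() and find_worker_executing_work():

#include <stdbool.h>
#include <stdio.h>

#define TOY_BUSY_PENDING 1u
#define TOY_BUSY_RUNNING 2u

struct toy_work { bool pending; };
struct toy_pool { int id; };

static bool toy_work_pending(const struct toy_work *w) { return w->pending; }
static bool toy_pool_running(const struct toy_pool *p, const struct toy_work *w)
{
	(void)p; (void)w;
	return false;            /* pretend nobody is currently executing it */
}

static unsigned int toy_work_busy(struct toy_work *w, struct toy_pool *pool)
{
	unsigned int ret = 0;

	if (toy_work_pending(w))
		ret |= TOY_BUSY_PENDING;         /* no pool or lock needed */

	if (pool) {                              /* RUNNING still needs the pool */
		/* the real code takes pool->lock around this lookup */
		if (toy_pool_running(pool, w))
			ret |= TOY_BUSY_RUNNING;
	}
	return ret;
}

int main(void)
{
	struct toy_work w = { .pending = true };

	printf("busy mask: %#x\n", toy_work_busy(&w, NULL));
	return 0;
}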
| @@ -3506,65 +3405,48 @@ EXPORT_SYMBOL_GPL(work_busy); | |||
| 3506 | * CPU hotplug. | 3405 | * CPU hotplug. |
| 3507 | * | 3406 | * |
| 3508 | * There are two challenges in supporting CPU hotplug. Firstly, there | 3407 | * There are two challenges in supporting CPU hotplug. Firstly, there |
| 3509 | * are a lot of assumptions on strong associations among work, cwq and | 3408 | * are a lot of assumptions on strong associations among work, pwq and |
| 3510 | * gcwq which make migrating pending and scheduled works very | 3409 | * pool which make migrating pending and scheduled works very |
| 3511 | * difficult to implement without impacting hot paths. Secondly, | 3410 | * difficult to implement without impacting hot paths. Secondly, |
| 3512 | * gcwqs serve mix of short, long and very long running works making | 3411 | * worker pools serve a mix of short, long and very long running works making |
| 3513 | * blocked draining impractical. | 3412 | * blocked draining impractical. |
| 3514 | * | 3413 | * |
| 3515 | * This is solved by allowing a gcwq to be disassociated from the CPU | 3414 | * This is solved by allowing the pools to be disassociated from the CPU |
| 3516 | * running as an unbound one and allowing it to be reattached later if the | 3415 | * running as unbound ones and allowing them to be reattached later if the |
| 3517 | * cpu comes back online. | 3416 | * cpu comes back online. |
| 3518 | */ | 3417 | */ |
| 3519 | 3418 | ||
| 3520 | /* claim manager positions of all pools */ | 3419 | static void wq_unbind_fn(struct work_struct *work) |
| 3521 | static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) | ||
| 3522 | { | 3420 | { |
| 3523 | struct worker_pool *pool; | 3421 | int cpu = smp_processor_id(); |
| 3524 | |||
| 3525 | for_each_worker_pool(pool, gcwq) | ||
| 3526 | mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); | ||
| 3527 | spin_lock_irq(&gcwq->lock); | ||
| 3528 | } | ||
| 3529 | |||
| 3530 | /* release manager positions */ | ||
| 3531 | static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) | ||
| 3532 | { | ||
| 3533 | struct worker_pool *pool; | ||
| 3534 | |||
| 3535 | spin_unlock_irq(&gcwq->lock); | ||
| 3536 | for_each_worker_pool(pool, gcwq) | ||
| 3537 | mutex_unlock(&pool->assoc_mutex); | ||
| 3538 | } | ||
| 3539 | |||
| 3540 | static void gcwq_unbind_fn(struct work_struct *work) | ||
| 3541 | { | ||
| 3542 | struct global_cwq *gcwq = get_gcwq(smp_processor_id()); | ||
| 3543 | struct worker_pool *pool; | 3422 | struct worker_pool *pool; |
| 3544 | struct worker *worker; | 3423 | struct worker *worker; |
| 3545 | struct hlist_node *pos; | ||
| 3546 | int i; | 3424 | int i; |
| 3547 | 3425 | ||
| 3548 | BUG_ON(gcwq->cpu != smp_processor_id()); | 3426 | for_each_std_worker_pool(pool, cpu) { |
| 3427 | BUG_ON(cpu != smp_processor_id()); | ||
| 3549 | 3428 | ||
| 3550 | gcwq_claim_assoc_and_lock(gcwq); | 3429 | mutex_lock(&pool->assoc_mutex); |
| 3430 | spin_lock_irq(&pool->lock); | ||
| 3551 | 3431 | ||
| 3552 | /* | 3432 | /* |
| 3553 | * We've claimed all manager positions. Make all workers unbound | 3433 | * We've claimed all manager positions. Make all workers |
| 3554 | * and set DISASSOCIATED. Before this, all workers except for the | 3434 | * unbound and set DISASSOCIATED. Before this, all workers |
| 3555 | * ones which are still executing works from before the last CPU | 3435 | * except for the ones which are still executing works from |
| 3556 | * down must be on the cpu. After this, they may become diasporas. | 3436 | * before the last CPU down must be on the cpu. After |
| 3557 | */ | 3437 | * this, they may become diasporas. |
| 3558 | for_each_worker_pool(pool, gcwq) | 3438 | */ |
| 3559 | list_for_each_entry(worker, &pool->idle_list, entry) | 3439 | list_for_each_entry(worker, &pool->idle_list, entry) |
| 3560 | worker->flags |= WORKER_UNBOUND; | 3440 | worker->flags |= WORKER_UNBOUND; |
| 3561 | 3441 | ||
| 3562 | for_each_busy_worker(worker, i, pos, gcwq) | 3442 | for_each_busy_worker(worker, i, pool) |
| 3563 | worker->flags |= WORKER_UNBOUND; | 3443 | worker->flags |= WORKER_UNBOUND; |
| 3564 | 3444 | ||
| 3565 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3445 | pool->flags |= POOL_DISASSOCIATED; |
| 3566 | 3446 | ||
| 3567 | gcwq_release_assoc_and_unlock(gcwq); | 3447 | spin_unlock_irq(&pool->lock); |
| 3448 | mutex_unlock(&pool->assoc_mutex); | ||
| 3449 | } | ||
| 3568 | 3450 | ||
| 3569 | /* | 3451 | /* |
| 3570 | * Call schedule() so that we cross rq->lock and thus can guarantee | 3452 | * Call schedule() so that we cross rq->lock and thus can guarantee |
| @@ -3576,16 +3458,16 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
| 3576 | /* | 3458 | /* |
| 3577 | * Sched callbacks are disabled now. Zap nr_running. After this, | 3459 | * Sched callbacks are disabled now. Zap nr_running. After this, |
| 3578 | * nr_running stays zero and need_more_worker() and keep_working() | 3460 | * nr_running stays zero and need_more_worker() and keep_working() |
| 3579 | * are always true as long as the worklist is not empty. @gcwq now | 3461 | * are always true as long as the worklist is not empty. Pools on |
| 3580 | * behaves as unbound (in terms of concurrency management) gcwq | 3462 | * @cpu now behave as unbound (in terms of concurrency management) |
| 3581 | * which is served by workers tied to the CPU. | 3463 | * pools which are served by workers tied to the CPU. |
| 3582 | * | 3464 | * |
| 3583 | * On return from this function, the current worker would trigger | 3465 | * On return from this function, the current worker would trigger |
| 3584 | * unbound chain execution of pending work items if other workers | 3466 | * unbound chain execution of pending work items if other workers |
| 3585 | * didn't already. | 3467 | * didn't already. |
| 3586 | */ | 3468 | */ |
| 3587 | for_each_worker_pool(pool, gcwq) | 3469 | for_each_std_worker_pool(pool, cpu) |
| 3588 | atomic_set(get_pool_nr_running(pool), 0); | 3470 | atomic_set(&pool->nr_running, 0); |
| 3589 | } | 3471 | } |
| 3590 | 3472 | ||
| 3591 | /* | 3473 | /* |
| @@ -3597,12 +3479,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 3597 | void *hcpu) | 3479 | void *hcpu) |
| 3598 | { | 3480 | { |
| 3599 | unsigned int cpu = (unsigned long)hcpu; | 3481 | unsigned int cpu = (unsigned long)hcpu; |
| 3600 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3601 | struct worker_pool *pool; | 3482 | struct worker_pool *pool; |
| 3602 | 3483 | ||
| 3603 | switch (action & ~CPU_TASKS_FROZEN) { | 3484 | switch (action & ~CPU_TASKS_FROZEN) { |
| 3604 | case CPU_UP_PREPARE: | 3485 | case CPU_UP_PREPARE: |
| 3605 | for_each_worker_pool(pool, gcwq) { | 3486 | for_each_std_worker_pool(pool, cpu) { |
| 3606 | struct worker *worker; | 3487 | struct worker *worker; |
| 3607 | 3488 | ||
| 3608 | if (pool->nr_workers) | 3489 | if (pool->nr_workers) |
| @@ -3612,18 +3493,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 3612 | if (!worker) | 3493 | if (!worker) |
| 3613 | return NOTIFY_BAD; | 3494 | return NOTIFY_BAD; |
| 3614 | 3495 | ||
| 3615 | spin_lock_irq(&gcwq->lock); | 3496 | spin_lock_irq(&pool->lock); |
| 3616 | start_worker(worker); | 3497 | start_worker(worker); |
| 3617 | spin_unlock_irq(&gcwq->lock); | 3498 | spin_unlock_irq(&pool->lock); |
| 3618 | } | 3499 | } |
| 3619 | break; | 3500 | break; |
| 3620 | 3501 | ||
| 3621 | case CPU_DOWN_FAILED: | 3502 | case CPU_DOWN_FAILED: |
| 3622 | case CPU_ONLINE: | 3503 | case CPU_ONLINE: |
| 3623 | gcwq_claim_assoc_and_lock(gcwq); | 3504 | for_each_std_worker_pool(pool, cpu) { |
| 3624 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3505 | mutex_lock(&pool->assoc_mutex); |
| 3625 | rebind_workers(gcwq); | 3506 | spin_lock_irq(&pool->lock); |
| 3626 | gcwq_release_assoc_and_unlock(gcwq); | 3507 | |
| 3508 | pool->flags &= ~POOL_DISASSOCIATED; | ||
| 3509 | rebind_workers(pool); | ||
| 3510 | |||
| 3511 | spin_unlock_irq(&pool->lock); | ||
| 3512 | mutex_unlock(&pool->assoc_mutex); | ||
| 3513 | } | ||
| 3627 | break; | 3514 | break; |
| 3628 | } | 3515 | } |
| 3629 | return NOTIFY_OK; | 3516 | return NOTIFY_OK; |
| @@ -3643,7 +3530,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
| 3643 | switch (action & ~CPU_TASKS_FROZEN) { | 3530 | switch (action & ~CPU_TASKS_FROZEN) { |
| 3644 | case CPU_DOWN_PREPARE: | 3531 | case CPU_DOWN_PREPARE: |
| 3645 | /* unbinding should happen on the local CPU */ | 3532 | /* unbinding should happen on the local CPU */ |
| 3646 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); | 3533 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); |
| 3647 | queue_work_on(cpu, system_highpri_wq, &unbind_work); | 3534 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
| 3648 | flush_work(&unbind_work); | 3535 | flush_work(&unbind_work); |
| 3649 | break; | 3536 | break; |
| @@ -3696,10 +3583,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
| 3696 | * | 3583 | * |
| 3697 | * Start freezing workqueues. After this function returns, all freezable | 3584 | * Start freezing workqueues. After this function returns, all freezable |
| 3698 | * workqueues will queue new works to their frozen_works list instead of | 3585 | * workqueues will queue new works to their frozen_works list instead of |
| 3699 | * gcwq->worklist. | 3586 | * pool->worklist. |
| 3700 | * | 3587 | * |
| 3701 | * CONTEXT: | 3588 | * CONTEXT: |
| 3702 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3589 | * Grabs and releases workqueue_lock and pool->lock's. |
| 3703 | */ | 3590 | */ |
| 3704 | void freeze_workqueues_begin(void) | 3591 | void freeze_workqueues_begin(void) |
| 3705 | { | 3592 | { |
| @@ -3710,23 +3597,26 @@ void freeze_workqueues_begin(void) | |||
| 3710 | BUG_ON(workqueue_freezing); | 3597 | BUG_ON(workqueue_freezing); |
| 3711 | workqueue_freezing = true; | 3598 | workqueue_freezing = true; |
| 3712 | 3599 | ||
| 3713 | for_each_gcwq_cpu(cpu) { | 3600 | for_each_wq_cpu(cpu) { |
| 3714 | struct global_cwq *gcwq = get_gcwq(cpu); | 3601 | struct worker_pool *pool; |
| 3715 | struct workqueue_struct *wq; | 3602 | struct workqueue_struct *wq; |
| 3716 | 3603 | ||
| 3717 | spin_lock_irq(&gcwq->lock); | 3604 | for_each_std_worker_pool(pool, cpu) { |
| 3605 | spin_lock_irq(&pool->lock); | ||
| 3718 | 3606 | ||
| 3719 | BUG_ON(gcwq->flags & GCWQ_FREEZING); | 3607 | WARN_ON_ONCE(pool->flags & POOL_FREEZING); |
| 3720 | gcwq->flags |= GCWQ_FREEZING; | 3608 | pool->flags |= POOL_FREEZING; |
| 3721 | 3609 | ||
| 3722 | list_for_each_entry(wq, &workqueues, list) { | 3610 | list_for_each_entry(wq, &workqueues, list) { |
| 3723 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3611 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3724 | 3612 | ||
| 3725 | if (cwq && wq->flags & WQ_FREEZABLE) | 3613 | if (pwq && pwq->pool == pool && |
| 3726 | cwq->max_active = 0; | 3614 | (wq->flags & WQ_FREEZABLE)) |
| 3727 | } | 3615 | pwq->max_active = 0; |
| 3616 | } | ||
| 3728 | 3617 | ||
| 3729 | spin_unlock_irq(&gcwq->lock); | 3618 | spin_unlock_irq(&pool->lock); |
| 3619 | } | ||
| 3730 | } | 3620 | } |
| 3731 | 3621 | ||
| 3732 | spin_unlock(&workqueue_lock); | 3622 | spin_unlock(&workqueue_lock); |
| @@ -3754,20 +3644,20 @@ bool freeze_workqueues_busy(void) | |||
| 3754 | 3644 | ||
| 3755 | BUG_ON(!workqueue_freezing); | 3645 | BUG_ON(!workqueue_freezing); |
| 3756 | 3646 | ||
| 3757 | for_each_gcwq_cpu(cpu) { | 3647 | for_each_wq_cpu(cpu) { |
| 3758 | struct workqueue_struct *wq; | 3648 | struct workqueue_struct *wq; |
| 3759 | /* | 3649 | /* |
| 3760 | * nr_active is monotonically decreasing. It's safe | 3650 | * nr_active is monotonically decreasing. It's safe |
| 3761 | * to peek without lock. | 3651 | * to peek without lock. |
| 3762 | */ | 3652 | */ |
| 3763 | list_for_each_entry(wq, &workqueues, list) { | 3653 | list_for_each_entry(wq, &workqueues, list) { |
| 3764 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3654 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3765 | 3655 | ||
| 3766 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) | 3656 | if (!pwq || !(wq->flags & WQ_FREEZABLE)) |
| 3767 | continue; | 3657 | continue; |
| 3768 | 3658 | ||
| 3769 | BUG_ON(cwq->nr_active < 0); | 3659 | BUG_ON(pwq->nr_active < 0); |
| 3770 | if (cwq->nr_active) { | 3660 | if (pwq->nr_active) { |
| 3771 | busy = true; | 3661 | busy = true; |
| 3772 | goto out_unlock; | 3662 | goto out_unlock; |
| 3773 | } | 3663 | } |
| @@ -3782,10 +3672,10 @@ out_unlock: | |||
| 3782 | * thaw_workqueues - thaw workqueues | 3672 | * thaw_workqueues - thaw workqueues |
| 3783 | * | 3673 | * |
| 3784 | * Thaw workqueues. Normal queueing is restored and all collected | 3674 | * Thaw workqueues. Normal queueing is restored and all collected |
| 3785 | * frozen works are transferred to their respective gcwq worklists. | 3675 | * frozen works are transferred to their respective pool worklists. |
| 3786 | * | 3676 | * |
| 3787 | * CONTEXT: | 3677 | * CONTEXT: |
| 3788 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3678 | * Grabs and releases workqueue_lock and pool->lock's. |
| 3789 | */ | 3679 | */ |
| 3790 | void thaw_workqueues(void) | 3680 | void thaw_workqueues(void) |
| 3791 | { | 3681 | { |
| @@ -3796,30 +3686,31 @@ void thaw_workqueues(void) | |||
| 3796 | if (!workqueue_freezing) | 3686 | if (!workqueue_freezing) |
| 3797 | goto out_unlock; | 3687 | goto out_unlock; |
| 3798 | 3688 | ||
| 3799 | for_each_gcwq_cpu(cpu) { | 3689 | for_each_wq_cpu(cpu) { |
| 3800 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3801 | struct worker_pool *pool; | 3690 | struct worker_pool *pool; |
| 3802 | struct workqueue_struct *wq; | 3691 | struct workqueue_struct *wq; |
| 3803 | 3692 | ||
| 3804 | spin_lock_irq(&gcwq->lock); | 3693 | for_each_std_worker_pool(pool, cpu) { |
| 3694 | spin_lock_irq(&pool->lock); | ||
| 3805 | 3695 | ||
| 3806 | BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); | 3696 | WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); |
| 3807 | gcwq->flags &= ~GCWQ_FREEZING; | 3697 | pool->flags &= ~POOL_FREEZING; |
| 3808 | 3698 | ||
| 3809 | list_for_each_entry(wq, &workqueues, list) { | 3699 | list_for_each_entry(wq, &workqueues, list) { |
| 3810 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3700 | struct pool_workqueue *pwq = get_pwq(cpu, wq); |
| 3811 | 3701 | ||
| 3812 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) | 3702 | if (!pwq || pwq->pool != pool || |
| 3813 | continue; | 3703 | !(wq->flags & WQ_FREEZABLE)) |
| 3704 | continue; | ||
| 3814 | 3705 | ||
| 3815 | /* restore max_active and repopulate worklist */ | 3706 | /* restore max_active and repopulate worklist */ |
| 3816 | cwq_set_max_active(cwq, wq->saved_max_active); | 3707 | pwq_set_max_active(pwq, wq->saved_max_active); |
| 3817 | } | 3708 | } |
| 3818 | 3709 | ||
| 3819 | for_each_worker_pool(pool, gcwq) | ||
| 3820 | wake_up_worker(pool); | 3710 | wake_up_worker(pool); |
| 3821 | 3711 | ||
| 3822 | spin_unlock_irq(&gcwq->lock); | 3712 | spin_unlock_irq(&pool->lock); |
| 3713 | } | ||
| 3823 | } | 3714 | } |
| 3824 | 3715 | ||
| 3825 | workqueue_freezing = false; | 3716 | workqueue_freezing = false; |
| @@ -3831,60 +3722,56 @@ out_unlock: | |||
| 3831 | static int __init init_workqueues(void) | 3722 | static int __init init_workqueues(void) |
| 3832 | { | 3723 | { |
| 3833 | unsigned int cpu; | 3724 | unsigned int cpu; |
| 3834 | int i; | ||
| 3835 | 3725 | ||
| 3836 | /* make sure we have enough bits for OFFQ CPU number */ | 3726 | /* make sure we have enough bits for OFFQ pool ID */ |
| 3837 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < | 3727 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < |
| 3838 | WORK_CPU_LAST); | 3728 | WORK_CPU_END * NR_STD_WORKER_POOLS); |
| 3839 | 3729 | ||
| 3840 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | 3730 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
| 3841 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | 3731 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); |
| 3842 | 3732 | ||
| 3843 | /* initialize gcwqs */ | 3733 | /* initialize CPU pools */ |
| 3844 | for_each_gcwq_cpu(cpu) { | 3734 | for_each_wq_cpu(cpu) { |
| 3845 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3846 | struct worker_pool *pool; | 3735 | struct worker_pool *pool; |
| 3847 | 3736 | ||
| 3848 | spin_lock_init(&gcwq->lock); | 3737 | for_each_std_worker_pool(pool, cpu) { |
| 3849 | gcwq->cpu = cpu; | 3738 | spin_lock_init(&pool->lock); |
| 3850 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3739 | pool->cpu = cpu; |
| 3851 | 3740 | pool->flags |= POOL_DISASSOCIATED; | |
| 3852 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | ||
| 3853 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | ||
| 3854 | |||
| 3855 | for_each_worker_pool(pool, gcwq) { | ||
| 3856 | pool->gcwq = gcwq; | ||
| 3857 | INIT_LIST_HEAD(&pool->worklist); | 3741 | INIT_LIST_HEAD(&pool->worklist); |
| 3858 | INIT_LIST_HEAD(&pool->idle_list); | 3742 | INIT_LIST_HEAD(&pool->idle_list); |
| 3743 | hash_init(pool->busy_hash); | ||
| 3859 | 3744 | ||
| 3860 | init_timer_deferrable(&pool->idle_timer); | 3745 | init_timer_deferrable(&pool->idle_timer); |
| 3861 | pool->idle_timer.function = idle_worker_timeout; | 3746 | pool->idle_timer.function = idle_worker_timeout; |
| 3862 | pool->idle_timer.data = (unsigned long)pool; | 3747 | pool->idle_timer.data = (unsigned long)pool; |
| 3863 | 3748 | ||
| 3864 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, | 3749 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, |
| 3865 | (unsigned long)pool); | 3750 | (unsigned long)pool); |
| 3866 | 3751 | ||
| 3867 | mutex_init(&pool->assoc_mutex); | 3752 | mutex_init(&pool->assoc_mutex); |
| 3868 | ida_init(&pool->worker_ida); | 3753 | ida_init(&pool->worker_ida); |
| 3754 | |||
| 3755 | /* alloc pool ID */ | ||
| 3756 | BUG_ON(worker_pool_assign_id(pool)); | ||
| 3869 | } | 3757 | } |
| 3870 | } | 3758 | } |
| 3871 | 3759 | ||
| 3872 | /* create the initial worker */ | 3760 | /* create the initial worker */ |
| 3873 | for_each_online_gcwq_cpu(cpu) { | 3761 | for_each_online_wq_cpu(cpu) { |
| 3874 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3875 | struct worker_pool *pool; | 3762 | struct worker_pool *pool; |
| 3876 | 3763 | ||
| 3877 | if (cpu != WORK_CPU_UNBOUND) | 3764 | for_each_std_worker_pool(pool, cpu) { |
| 3878 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
| 3879 | |||
| 3880 | for_each_worker_pool(pool, gcwq) { | ||
| 3881 | struct worker *worker; | 3765 | struct worker *worker; |
| 3882 | 3766 | ||
| 3767 | if (cpu != WORK_CPU_UNBOUND) | ||
| 3768 | pool->flags &= ~POOL_DISASSOCIATED; | ||
| 3769 | |||
| 3883 | worker = create_worker(pool); | 3770 | worker = create_worker(pool); |
| 3884 | BUG_ON(!worker); | 3771 | BUG_ON(!worker); |
| 3885 | spin_lock_irq(&gcwq->lock); | 3772 | spin_lock_irq(&pool->lock); |
| 3886 | start_worker(worker); | 3773 | start_worker(worker); |
| 3887 | spin_unlock_irq(&gcwq->lock); | 3774 | spin_unlock_irq(&pool->lock); |
| 3888 | } | 3775 | } |
| 3889 | } | 3776 | } |
| 3890 | 3777 | ||
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h new file mode 100644 index 000000000000..07650264ec15 --- /dev/null +++ b/kernel/workqueue_internal.h | |||
| @@ -0,0 +1,65 @@ | |||
| 1 | /* | ||
| 2 | * kernel/workqueue_internal.h | ||
| 3 | * | ||
| 4 | * Workqueue internal header file. Only to be included by workqueue and | ||
| 5 | * core kernel subsystems. | ||
| 6 | */ | ||
| 7 | #ifndef _KERNEL_WORKQUEUE_INTERNAL_H | ||
| 8 | #define _KERNEL_WORKQUEUE_INTERNAL_H | ||
| 9 | |||
| 10 | #include <linux/workqueue.h> | ||
| 11 | #include <linux/kthread.h> | ||
| 12 | |||
| 13 | struct worker_pool; | ||
| 14 | |||
| 15 | /* | ||
| 16 | * The poor guys doing the actual heavy lifting. All on-duty workers are | ||
| 17 | * either serving the manager role, on idle list or on busy hash. For | ||
| 18 | * details on the locking annotation (L, I, X...), refer to workqueue.c. | ||
| 19 | * | ||
| 20 | * Only to be used in workqueue and async. | ||
| 21 | */ | ||
| 22 | struct worker { | ||
| 23 | /* on idle list while idle, on busy hash table while busy */ | ||
| 24 | union { | ||
| 25 | struct list_head entry; /* L: while idle */ | ||
| 26 | struct hlist_node hentry; /* L: while busy */ | ||
| 27 | }; | ||
| 28 | |||
| 29 | struct work_struct *current_work; /* L: work being processed */ | ||
| 30 | work_func_t current_func; /* L: current_work's fn */ | ||
| 31 | struct pool_workqueue *current_pwq; /* L: current_work's pwq */ | ||
| 32 | struct list_head scheduled; /* L: scheduled works */ | ||
| 33 | struct task_struct *task; /* I: worker task */ | ||
| 34 | struct worker_pool *pool; /* I: the associated pool */ | ||
| 35 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
| 36 | unsigned long last_active; /* L: last active timestamp */ | ||
| 37 | unsigned int flags; /* X: flags */ | ||
| 38 | int id; /* I: worker id */ | ||
| 39 | |||
| 40 | /* for rebinding worker to CPU */ | ||
| 41 | struct work_struct rebind_work; /* L: for busy worker */ | ||
| 42 | |||
| 43 | /* used only by rescuers to point to the target workqueue */ | ||
| 44 | struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ | ||
| 45 | }; | ||
| 46 | |||
| 47 | /** | ||
| 48 | * current_wq_worker - return struct worker if %current is a workqueue worker | ||
| 49 | */ | ||
| 50 | static inline struct worker *current_wq_worker(void) | ||
| 51 | { | ||
| 52 | if (current->flags & PF_WQ_WORKER) | ||
| 53 | return kthread_data(current); | ||
| 54 | return NULL; | ||
| 55 | } | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Scheduler hooks for concurrency managed workqueue. Only to be used from | ||
| 59 | * sched.c and workqueue.c. | ||
| 60 | */ | ||
| 61 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | ||
| 62 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
| 63 | unsigned int cpu); | ||
| 64 | |||
| 65 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ | ||
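current_wq_worker() in the new header relies on two pieces of plumbing visible above: worker tasks carry the PF_WQ_WORKER flag, and kthread_data() returns the per-thread payload handed over when the worker kthread was created, which for workers is their struct worker. The sketch below is a stand-alone model of that lookup under those assumptions; the struct task type, the stored pointer and the flag value are illustrative stand-ins, not the kernel's definitions.

/*
 * Stand-alone model of the current_wq_worker() lookup: a task is recognised
 * as a workqueue worker by a flag bit, and only then is its per-thread
 * payload reinterpreted as a struct worker.  Illustrative, not kernel code.
 */
#include <stdio.h>

#define PF_WQ_WORKER 0x00000020   /* stand-in for the kernel flag */

struct worker { int id; };

struct task {
	unsigned int flags;
	void *kthread_payload;    /* models what kthread_data(task) returns */
};

static struct worker *current_wq_worker(struct task *current)
{
	if (current->flags & PF_WQ_WORKER)
		return current->kthread_payload;
	return NULL;              /* not a workqueue worker */
}

int main(void)
{
	struct worker w = { .id = 7 };
	struct task worker_task = { .flags = PF_WQ_WORKER, .kthread_payload = &w };
	struct task other_task  = { .flags = 0, .kthread_payload = NULL };

	struct worker *me = current_wq_worker(&worker_task);
	printf("worker id: %d\n", me ? me->id : -1);
	printf("other task is a worker? %s\n",
	       current_wq_worker(&other_task) ? "yes" : "no");
	return 0;
}

The flag check is what makes the cast safe: reading the kthread payload of an arbitrary task would yield unrelated data, so only tasks marked PF_WQ_WORKER are treated as workers.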
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h deleted file mode 100644 index 2d10fc98dc79..000000000000 --- a/kernel/workqueue_sched.h +++ /dev/null | |||
| @@ -1,9 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * kernel/workqueue_sched.h | ||
| 3 | * | ||
| 4 | * Scheduler hooks for concurrency managed workqueue. Only to be | ||
| 5 | * included from sched.c and workqueue.c. | ||
| 6 | */ | ||
| 7 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | ||
| 8 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
| 9 | unsigned int cpu); | ||
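The deleted header above carried only these two scheduler-hook declarations, which now live in workqueue_internal.h. Their role in concurrency-managed workqueues is to keep each pool's count of runnable workers at the desired level: when the last running worker of a pool blocks while work is still pending, the sleeping hook hands back an idle worker for the scheduler to wake. The sketch below is a simplified stand-alone model of that bookkeeping, not the kernel implementation; every name and field in it is an illustrative assumption.

/*
 * Simplified model of the idea behind wq_worker_sleeping() and
 * wq_worker_waking_up(): a pool tracks how many of its workers are runnable,
 * and when the last one blocks while work is still queued, an idle worker is
 * handed back to be woken.  Illustrative only -- no locking, no kernel types.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_pool {
	int nr_running;       /* runnable workers */
	int nr_idle;          /* parked workers that could be woken */
	int nr_pending_work;  /* queued work items */
};

/* called when a worker wakes up and becomes runnable again */
static void model_worker_waking_up(struct model_pool *pool)
{
	pool->nr_running++;
}

/*
 * called when a worker is about to block; returns true when an idle worker
 * should be woken to preserve the pool's concurrency level
 */
static bool model_worker_sleeping(struct model_pool *pool)
{
	pool->nr_running--;
	if (pool->nr_running == 0 && pool->nr_pending_work > 0 && pool->nr_idle > 0) {
		pool->nr_idle--;
		pool->nr_running++;   /* the woken worker starts running */
		return true;
	}
	return false;
}

int main(void)
{
	struct model_pool pool = { .nr_running = 1, .nr_idle = 2, .nr_pending_work = 3 };

	/* the only running worker blocks on I/O: an idle one should be woken */
	printf("wake a replacement? %s\n",
	       model_worker_sleeping(&pool) ? "yes" : "no");
	model_worker_waking_up(&pool);   /* the blocked worker returns */
	printf("runnable workers now: %d\n", pool.nr_running);
	return 0;
}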
