aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2011-03-07 18:55:06 -0500
committerJames Morris <jmorris@namei.org>2011-03-07 18:55:06 -0500
commit1cc26bada9f6807814806db2f0d78792eecdac71 (patch)
tree5509b5139db04af6c13db0a580c84116a4a54039 /kernel
parenteae61f3c829439f8f9121b5cd48a14be04df451f (diff)
parent214d93b02c4fe93638ad268613c9702a81ed9192 (diff)
Merge branch 'master'; commit 'v2.6.38-rc7' into next
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/cgroup.c45
-rw-r--r--kernel/cpuset.c7
-rw-r--r--kernel/cred.c16
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/exit.c14
-rw-r--r--kernel/fork.c41
-rw-r--r--kernel/freezer.c9
-rw-r--r--kernel/futex.c62
-rw-r--r--kernel/hrtimer.c2
-rw-r--r--kernel/irq/Kconfig3
-rw-r--r--kernel/irq/handle.c111
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/irqdesc.c51
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/latencytop.c23
-rw-r--r--kernel/lockdep.c18
-rw-r--r--kernel/module.c16
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c65
-rw-r--r--kernel/perf_event.c182
-rw-r--r--kernel/power/Kconfig5
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/hibernate.c11
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/nvs.c136
-rw-r--r--kernel/power/process.c14
-rw-r--r--kernel/power/snapshot.c7
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/printk.c184
-rw-r--r--kernel/ptrace.c8
-rw-r--r--kernel/rcutiny.c3
-rw-r--r--kernel/sched.c28
-rw-r--r--kernel/sched_autogroup.c32
-rw-r--r--kernel/sched_autogroup.h4
-rw-r--r--kernel/sched_debug.c42
-rw-r--r--kernel/sched_fair.c126
-rw-r--r--kernel/sched_rt.c2
-rw-r--r--kernel/smp.c75
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/srcu.c15
-rw-r--r--kernel/sys.c9
-rw-r--r--kernel/sysctl.c34
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/clocksource.c10
-rw-r--r--kernel/time/ntp.c425
-rw-r--r--kernel/time/tick-broadcast.c10
-rw-r--r--kernel/time/tick-common.c6
-rw-r--r--kernel/time/tick-internal.h3
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/time/timekeeping.c47
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c60
-rw-r--r--kernel/trace/trace.c6
-rw-r--r--kernel/trace/trace_entries.h2
-rw-r--r--kernel/trace/trace_events.c12
-rw-r--r--kernel/trace/trace_export.c6
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/trace/trace_syscalls.c52
-rw-r--r--kernel/tracepoint.c31
-rw-r--r--kernel/user_namespace.c15
-rw-r--r--kernel/watchdog.c53
-rw-r--r--kernel/workqueue.c57
73 files changed, 1425 insertions, 885 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 46obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 47ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 48obj-y += up.o
49endif 49endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
121# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
122# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
123targets += config_data.gz 124targets += config_data.gz
124$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
125 $(call if_changed,gzip) 126 $(call if_changed,gzip)
126 127
127quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 405 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 406 audit_hold_skb(skb);
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
306 BUG(); 306 BUG();
307 } 307 }
308 308
309 if (security_capable(cap) == 0) { 309 if (security_capable(current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
311 return 1; 311 return 1;
312 } 312 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 51cddc11cd85..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -763,9 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
763 * -> cgroup_mkdir. 763 * -> cgroup_mkdir.
764 */ 764 */
765 765
766static struct dentry *cgroup_lookup(struct inode *dir,
767 struct dentry *dentry, struct nameidata *nd);
768static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
769static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 768static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
770static int cgroup_populate_dir(struct cgroup *cgrp); 769static int cgroup_populate_dir(struct cgroup *cgrp);
771static const struct inode_operations cgroup_dir_inode_operations; 770static const struct inode_operations cgroup_dir_inode_operations;
@@ -862,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
862 iput(inode); 861 iput(inode);
863} 862}
864 863
864static int cgroup_delete(const struct dentry *d)
865{
866 return 1;
867}
868
865static void remove_dir(struct dentry *d) 869static void remove_dir(struct dentry *d)
866{ 870{
867 struct dentry *parent = dget(d->d_parent); 871 struct dentry *parent = dget(d->d_parent);
@@ -912,7 +916,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
912 916
913 parent = dentry->d_parent; 917 parent = dentry->d_parent;
914 spin_lock(&parent->d_lock); 918 spin_lock(&parent->d_lock);
915 spin_lock(&dentry->d_lock); 919 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
916 list_del_init(&dentry->d_u.d_child); 920 list_del_init(&dentry->d_u.d_child);
917 spin_unlock(&dentry->d_lock); 921 spin_unlock(&dentry->d_lock);
918 spin_unlock(&parent->d_lock); 922 spin_unlock(&parent->d_lock);
@@ -1451,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1451 1455
1452static int cgroup_get_rootdir(struct super_block *sb) 1456static int cgroup_get_rootdir(struct super_block *sb)
1453{ 1457{
1458 static const struct dentry_operations cgroup_dops = {
1459 .d_iput = cgroup_diput,
1460 .d_delete = cgroup_delete,
1461 };
1462
1454 struct inode *inode = 1463 struct inode *inode =
1455 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1464 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1456 struct dentry *dentry; 1465 struct dentry *dentry;
@@ -1468,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
1468 return -ENOMEM; 1477 return -ENOMEM;
1469 } 1478 }
1470 sb->s_root = dentry; 1479 sb->s_root = dentry;
1480 /* for everything else we want ->d_op set */
1481 sb->s_d_op = &cgroup_dops;
1471 return 0; 1482 return 0;
1472} 1483}
1473 1484
@@ -2197,6 +2208,14 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2197 .rename = cgroup_rename, 2208 .rename = cgroup_rename,
2198}; 2209};
2199 2210
2211static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2212{
2213 if (dentry->d_name.len > NAME_MAX)
2214 return ERR_PTR(-ENAMETOOLONG);
2215 d_add(dentry, NULL);
2216 return NULL;
2217}
2218
2200/* 2219/*
2201 * Check if a file is a control file 2220 * Check if a file is a control file
2202 */ 2221 */
@@ -2207,26 +2226,6 @@ static inline struct cftype *__file_cft(struct file *file)
2207 return __d_cft(file->f_dentry); 2226 return __d_cft(file->f_dentry);
2208} 2227}
2209 2228
2210static int cgroup_delete_dentry(const struct dentry *dentry)
2211{
2212 return 1;
2213}
2214
2215static struct dentry *cgroup_lookup(struct inode *dir,
2216 struct dentry *dentry, struct nameidata *nd)
2217{
2218 static const struct dentry_operations cgroup_dentry_operations = {
2219 .d_delete = cgroup_delete_dentry,
2220 .d_iput = cgroup_diput,
2221 };
2222
2223 if (dentry->d_name.len > NAME_MAX)
2224 return ERR_PTR(-ENAMETOOLONG);
2225 d_set_d_op(dentry, &cgroup_dentry_operations);
2226 d_add(dentry, NULL);
2227 return NULL;
2228}
2229
2230static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2229static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2231 struct super_block *sb) 2230 struct super_block *sb)
2232{ 2231{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..e92e98189032 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1575 return -ENODEV; 1575 return -ENODEV;
1576 1576
1577 trialcs = alloc_trial_cpuset(cs); 1577 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) 1578 if (!trialcs) {
1579 return -ENOMEM; 1579 retval = -ENOMEM;
1580 goto out;
1581 }
1580 1582
1581 switch (cft->private) { 1583 switch (cft->private) {
1582 case FILE_CPULIST: 1584 case FILE_CPULIST:
@@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1591 } 1593 }
1592 1594
1593 free_trial_cpuset(trialcs); 1595 free_trial_cpuset(trialcs);
1596out:
1594 cgroup_unlock(); 1597 cgroup_unlock();
1595 return retval; 1598 return retval;
1596} 1599}
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
252#endif 252#endif
253 253
254 atomic_set(&new->usage, 1); 254 atomic_set(&new->usage, 1);
255#ifdef CONFIG_DEBUG_CREDENTIALS
256 new->magic = CRED_MAGIC;
257#endif
255 258
256 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) 259 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
257 goto error; 260 goto error;
258 261
259#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC;
261#endif
262 return new; 262 return new;
263 263
264error: 264error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
657 validate_creds(old); 657 validate_creds(old);
658 658
659 *new = *old; 659 *new = *old;
660 atomic_set(&new->usage, 1);
661 set_cred_subscribers(new, 0);
660 get_uid(new->user); 662 get_uid(new->user);
661 get_group_info(new->group_info); 663 get_group_info(new->group_info);
662 664
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
674 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 676 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
675 goto error; 677 goto error;
676 678
677 atomic_set(&new->usage, 1);
678 set_cred_subscribers(new, 0);
679 put_cred(old); 679 put_cred(old);
680 validate_creds(new); 680 validate_creds(new);
681 return new; 681 return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
748 if (cred->magic != CRED_MAGIC) 748 if (cred->magic != CRED_MAGIC)
749 return true; 749 return true;
750#ifdef CONFIG_SECURITY_SELINUX 750#ifdef CONFIG_SECURITY_SELINUX
751 if (selinux_is_enabled()) { 751 /*
752 * cred->security == NULL if security_cred_alloc_blank() or
753 * security_prepare_creds() returned an error.
754 */
755 if (selinux_is_enabled() && cred->security) {
752 if ((unsigned long) cred->security < PAGE_SIZE) 756 if ((unsigned long) cred->security < PAGE_SIZE)
753 return true; 757 return true;
754 if ((*(u32 *)cred->security & 0xffffff00) == 758 if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2914 } 2914 }
2915} 2915}
2916 2916
2917/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2918void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2919{ 2919{
2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 89c74861a3da..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
994 exit_fs(tsk); 994 exit_fs(tsk);
995 check_stack_usage(); 995 check_stack_usage();
996 exit_thread(); 996 exit_thread();
997
998 /*
999 * Flush inherited counters to the parent - before the parent
1000 * gets woken up by child-exit notifications.
1001 *
1002 * because of cgroup mode, must be called before cgroup_exit()
1003 */
1004 perf_event_exit_task(tsk);
1005
997 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
998 1007
999 if (group_dead) 1008 if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
1007 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
1008 */ 1017 */
1009 flush_ptrace_hw_breakpoint(tsk); 1018 flush_ptrace_hw_breakpoint(tsk);
1010 /*
1011 * Flush inherited counters to the parent - before the parent
1012 * gets woken up by child-exit notifications.
1013 */
1014 perf_event_exit_task(tsk);
1015 1019
1016 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1017#ifdef CONFIG_NUMA 1021#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
330 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
331 if (retval) 332 if (retval)
332 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
333 337
334 prev = NULL; 338 prev = NULL;
335 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
529 mm_free_pgd(mm); 533 mm_free_pgd(mm);
530 destroy_context(mm); 534 destroy_context(mm);
531 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
532 free_mm(mm); 539 free_mm(mm);
533} 540}
534EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
543 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
544 exit_aio(mm); 551 exit_aio(mm);
545 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
546 exit_mmap(mm); 554 exit_mmap(mm);
547 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
548 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
669 mm->token_priority = 0; 677 mm->token_priority = 0;
670 mm->last_interval = 0; 678 mm->last_interval = 0;
671 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
672 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
673 goto fail_nomem; 685 goto fail_nomem;
674 686
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
910 922
911 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
913 926
914 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
915 928
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1410 } 1423 }
1411 1424
1412 /* 1425 /*
1413 * We hope to recycle these flags after 2.6.26
1414 */
1415 if (unlikely(clone_flags & CLONE_STOPPED)) {
1416 static int __read_mostly count = 100;
1417
1418 if (count > 0 && printk_ratelimit()) {
1419 char comm[TASK_COMM_LEN];
1420
1421 count--;
1422 printk(KERN_INFO "fork(): process `%s' used deprecated "
1423 "clone flags 0x%lx\n",
1424 get_task_comm(comm, current),
1425 clone_flags & CLONE_STOPPED);
1426 }
1427 }
1428
1429 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1431 */ 1427 */
1432 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1460 */
1465 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1466 1462
1467 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1468 /*
1469 * We'll start up with an immediate SIGSTOP.
1470 */
1471 sigaddset(&p->pending.signal, SIGSTOP);
1472 set_tsk_thread_flag(p, TIF_SIGPENDING);
1473 __set_task_state(p, TASK_STOPPED);
1474 } else {
1475 wake_up_new_task(p, clone_flags);
1476 }
1477 1464
1478 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1479 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
233{ 233{
234 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
236 struct page *page; 236 struct page *page, *page_head;
237 int err; 237 int err;
238 238
239 /* 239 /*
@@ -265,11 +265,46 @@ again:
265 if (err < 0) 265 if (err < 0)
266 return err; 266 return err;
267 267
268 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 lock_page(page); 269 page_head = page;
270 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
271 unlock_page(page);
272 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
273 goto again; 308 goto again;
274 } 309 }
275 310
@@ -280,20 +315,20 @@ again:
280 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
281 * the object not the particular process. 316 * the object not the particular process.
282 */ 317 */
283 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
284 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
285 key->private.mm = mm; 320 key->private.mm = mm;
286 key->private.address = address; 321 key->private.address = address;
287 } else { 322 } else {
288 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
289 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
290 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
291 } 326 }
292 327
293 get_futex_key_refs(key); 328 get_futex_key_refs(key);
294 329
295 unlock_page(page); 330 unlock_page(page_head);
296 put_page(page); 331 put_page(page_head);
297 return 0; 332 return 0;
298} 333}
299 334
@@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
791 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 826 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
792 827
793 /* 828 /*
794 * This happens when we have stolen the lock and the original 829 * It is possible that the next waiter (the one that brought
795 * pending owner did not enqueue itself back on the rt_mutex. 830 * this owner to the kernel) timed out and is no longer
796 * Thats not a tragedy. We know that way, that a lock waiter 831 * waiting on the lock.
797 * is on the fly. We make the futex_q waiter the pending owner.
798 */ 832 */
799 if (!new_owner) 833 if (!new_owner)
800 new_owner = this->task; 834 new_owner = this->task;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 45da2b6920ab..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1745,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1745 } 1745 }
1746 1746
1747 /* 1747 /*
1748 * A NULL parameter means "inifinte" 1748 * A NULL parameter means "infinite"
1749 */ 1749 */
1750 if (!expires) { 1750 if (!expires) {
1751 schedule(); 1751 schedule();
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 9config GENERIC_HARDIRQS
10 def_bool y 10 def_bool y
11 11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 12# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 13config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 14 def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
118 118
119 return retval; 119 return retval;
120} 120}
121
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
123
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers
126#endif
127
128/**
129 * __do_IRQ - original all in one highlevel IRQ handler
130 * @irq: the interrupt number
131 *
132 * __do_IRQ handles all normal device IRQ's (the special
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */
139unsigned int __do_IRQ(unsigned int irq)
140{
141 struct irq_desc *desc = irq_to_desc(irq);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230}
231#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..99c3bc8a6fb4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -3,6 +3,12 @@
3 */ 3 */
4#include <linux/irqdesc.h> 4#include <linux/irqdesc.h>
5 5
6#ifdef CONFIG_SPARSE_IRQ
7# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
8#else
9# define IRQ_BITMAP_BITS NR_IRQS
10#endif
11
6extern int noirqdebug; 12extern int noirqdebug;
7 13
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 14#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..2039bea31bdf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -91,7 +94,7 @@ int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs); 94EXPORT_SYMBOL_GPL(nr_irqs);
92 95
93static DEFINE_MUTEX(sparse_irq_lock); 96static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS); 97static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
95 98
96#ifdef CONFIG_SPARSE_IRQ 99#ifdef CONFIG_SPARSE_IRQ
97 100
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -215,6 +217,15 @@ int __init early_irq_init(void)
215 initcnt = arch_probe_nr_irqs(); 217 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217 219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
218 for (i = 0; i < initcnt; i++) { 229 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node); 230 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs); 231 set_bit(i, allocated_irqs);
@@ -234,7 +245,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 245 }
235}; 246};
236 247
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 248int __init early_irq_init(void)
239{ 249{
240 int count, i, node = first_online_node; 250 int count, i, node = first_online_node;
@@ -250,7 +260,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 260 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 261 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 262 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 263 /* TODO : do this allocation on-demand ... */
264 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 265 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 266 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 267 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +286,22 @@ static void free_desc(unsigned int irq)
275 286
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 287static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 288{
289#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
290 struct irq_desc *desc;
291 unsigned int i;
292
293 for (i = 0; i < cnt; i++) {
294 desc = irq_to_desc(start + i);
295 if (desc && !desc->kstat_irqs) {
296 unsigned int __percpu *stats = alloc_percpu(unsigned int);
297
298 if (!stats)
299 return -1;
300 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
301 free_percpu(stats);
302 }
303 }
304#endif
278 return start; 305 return start;
279} 306}
280#endif /* !CONFIG_SPARSE_IRQ */ 307#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +418,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 418unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 419{
393 struct irq_desc *desc = irq_to_desc(irq); 420 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 421
422 return desc && desc->kstat_irqs ?
423 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 424}
396 425
397#ifdef CONFIG_GENERIC_HARDIRQS 426#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +430,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 430 int cpu;
402 int sum = 0; 431 int sum = 0;
403 432
404 if (!desc) 433 if (!desc || !desc->kstat_irqs)
405 return 0; 434 return 0;
406 for_each_possible_cpu(cpu) 435 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 436 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 437 return sum;
409} 438}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 439#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..9033c1c70828 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1100,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1100 if (retval) 1100 if (retval)
1101 kfree(action); 1101 kfree(action);
1102 1102
1103#ifdef CONFIG_DEBUG_SHIRQ 1103#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1104 if (!retval && (irqflags & IRQF_SHARED)) { 1104 if (!retval && (irqflags & IRQF_SHARED)) {
1105 /* 1105 /*
1106 * It's a shared IRQ -- the driver ought to be prepared for it 1106 * It's a shared IRQ -- the driver ought to be prepared for it
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..441fd629ff04 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
56void move_native_irq(int irq) 56void move_native_irq(int irq)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq); 58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked;
59 60
60 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 61 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
61 return; 62 return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
63 if (unlikely(desc->status & IRQ_DISABLED)) 64 if (unlikely(desc->status & IRQ_DISABLED))
64 return; 65 return;
65 66
66 desc->irq_data.chip->irq_mask(&desc->irq_data); 67 /*
68 * Be careful vs. already masked interrupts. If this is a
69 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm.
71 */
72 masked = desc->status & IRQ_MASKED;
73 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data);
67 move_masked_irq(irq); 75 move_masked_irq(irq);
68 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data);
69} 78}
70
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..dc49358b73fa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 167 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
241 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
242 242
243 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
244 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
245 int q; 247 int q;
246 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
247 latency_record[i].count, 249 lr->count, lr->time, lr->max);
248 latency_record[i].time,
249 latency_record[i].max);
250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
251 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
252 char *c; 252 if (!bt)
253 if (!latency_record[i].backtrace[q])
254 break; 253 break;
255 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
256 break; 255 break;
257 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
258 c = strchr(sym, '+');
259 if (c)
260 *c = 0;
261 seq_printf(m, "%s ", sym);
262 } 257 }
263 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
264 } 259 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 34e00b708fad..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2460#endif 2460#endif
2461 2461
2462#ifdef CONFIG_TRACEPOINTS 2462#ifdef CONFIG_TRACEPOINTS
2463 mod->tracepoints = section_objs(info, "__tracepoints", 2463 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2464 sizeof(*mod->tracepoints), 2464 sizeof(*mod->tracepoints_ptrs),
2465 &mod->num_tracepoints); 2465 &mod->num_tracepoints);
2466#endif 2466#endif
2467#ifdef HAVE_JUMP_LABEL 2467#ifdef HAVE_JUMP_LABEL
2468 mod->jump_entries = section_objs(info, "__jump_table", 2468 mod->jump_entries = section_objs(info, "__jump_table",
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod,
3393 struct modversion_info *ver, 3393 struct modversion_info *ver,
3394 struct kernel_param *kp, 3394 struct kernel_param *kp,
3395 struct kernel_symbol *ks, 3395 struct kernel_symbol *ks,
3396 struct tracepoint *tp) 3396 struct tracepoint * const *tp)
3397{ 3397{
3398} 3398}
3399EXPORT_SYMBOL(module_layout); 3399EXPORT_SYMBOL(module_layout);
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void)
3407 mutex_lock(&module_mutex); 3407 mutex_lock(&module_mutex);
3408 list_for_each_entry(mod, &modules, list) 3408 list_for_each_entry(mod, &modules, list)
3409 if (!mod->taints) 3409 if (!mod->taints)
3410 tracepoint_update_probe_range(mod->tracepoints, 3410 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3411 mod->tracepoints + mod->num_tracepoints); 3411 mod->tracepoints_ptrs + mod->num_tracepoints);
3412 mutex_unlock(&module_mutex); 3412 mutex_unlock(&module_mutex);
3413} 3413}
3414 3414
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3432 else if (iter_mod > iter->module) 3432 else if (iter_mod > iter->module)
3433 iter->tracepoint = NULL; 3433 iter->tracepoint = NULL;
3434 found = tracepoint_get_iter_range(&iter->tracepoint, 3434 found = tracepoint_get_iter_range(&iter->tracepoint,
3435 iter_mod->tracepoints, 3435 iter_mod->tracepoints_ptrs,
3436 iter_mod->tracepoints 3436 iter_mod->tracepoints_ptrs
3437 + iter_mod->num_tracepoints); 3437 + iter_mod->num_tracepoints);
3438 if (found) { 3438 if (found) {
3439 iter->module = iter_mod; 3439 iter->module = iter_mod;
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 719 params[i].ops->free(params[i].arg);
720} 720}
721 721
722static void __init kernel_add_sysfs_param(const char *name, 722static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 723{
726 struct module_kobject *mk; 724 struct module_kobject *mk;
727 struct kobject *kobj; 725 struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 727
730 kobj = kset_find_obj(module_kset, name); 728 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 729 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 730 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 731 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 732 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 733 BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 738 "%s", name);
744 if (err) { 739 if (err) {
745 kobject_put(&mk->kobj); 740 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 741 printk(KERN_ERR
747 "error number %d\n", name, err); 742 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 743 name, err);
749 return; 744 printk(KERN_ERR
745 "The system will be unstable now.\n");
746 return NULL;
750 } 747 }
751 /* So that exit path is even. */ 748
749 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 750 kobject_get(&mk->kobj);
753 } 751 }
754 752
753 return mk;
754}
755
756static void __init kernel_add_sysfs_param(const char *name,
757 struct kernel_param *kparam,
758 unsigned int name_skip)
759{
760 struct module_kobject *mk;
761 int err;
762
763 mk = locate_module_kobject(name);
764 if (!mk)
765 return;
766
767 /* We need to remove old parameters before adding more. */
768 if (mk->mp)
769 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
770
755 /* These should not fail at boot. */ 771 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 772 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 773 BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
796 } 812 }
797} 813}
798 814
815ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf)
817{
818 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr);
820
821 return sprintf(buf, "%s\n", vattr->version);
822}
823
824extern struct module_version_attribute __start___modver[], __stop___modver[];
825
826static void __init version_sysfs_builtin(void)
827{
828 const struct module_version_attribute *vattr;
829 struct module_kobject *mk;
830 int err;
831
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
833 mk = locate_module_kobject(vattr->module_name);
834 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
836 kobject_uevent(&mk->kobj, KOBJ_ADD);
837 kobject_put(&mk->kobj);
838 }
839 }
840}
799 841
800/* module-related sysfs stuff */ 842/* module-related sysfs stuff */
801 843
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
875 } 917 }
876 module_sysfs_initialized = 1; 918 module_sysfs_initialized = 1;
877 919
920 version_sysfs_builtin();
878 param_sysfs_builtin(); 921 param_sysfs_builtin();
879 922
880 return 0; 923 return 0;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf1e8cc..656222fcf767 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,12 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
41atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
42static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
43static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
65 71
66static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
67 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
68void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
69 81
70extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
72 return "pmu"; 84 return "pmu";
73} 85}
74 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
75void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
76{ 93{
77 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
240 put_ctx(ctx); 257 put_ctx(ctx);
241} 258}
242 259
243static inline u64 perf_clock(void)
244{
245 return local_clock();
246}
247
248/* 260/*
249 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
250 */ 262 */
@@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
256 ctx->timestamp = now; 268 ctx->timestamp = now;
257} 269}
258 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
259/* 277/*
260 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
261 */ 279 */
@@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event)
269 return; 287 return;
270 288
271 if (ctx->is_active) 289 if (ctx->is_active)
272 run_end = ctx->time; 290 run_end = perf_event_time(event);
273 else 291 else
274 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
275 293
@@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event)
278 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
279 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
280 else 298 else
281 run_end = ctx->time; 299 run_end = perf_event_time(event);
282 300
283 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
284} 302}
@@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event,
534 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
535 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
536{ 554{
555 u64 tstamp = perf_event_time(event);
537 u64 delta; 556 u64 delta;
538 /* 557 /*
539 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event,
545 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
546 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
547 event->tstamp_running += delta; 566 event->tstamp_running += delta;
548 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
549 } 568 }
550 569
551 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event,
556 event->pending_disable = 0; 575 event->pending_disable = 0;
557 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
558 } 577 }
559 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
560 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
561 event->oncpu = -1; 580 event->oncpu = -1;
562 581
@@ -763,16 +782,33 @@ retry:
763 raw_spin_unlock_irq(&ctx->lock); 782 raw_spin_unlock_irq(&ctx->lock);
764} 783}
765 784
785#define MAX_INTERRUPTS (~0ULL)
786
787static void perf_log_throttle(struct perf_event *event, int enable);
788
766static int 789static int
767event_sched_in(struct perf_event *event, 790event_sched_in(struct perf_event *event,
768 struct perf_cpu_context *cpuctx, 791 struct perf_cpu_context *cpuctx,
769 struct perf_event_context *ctx) 792 struct perf_event_context *ctx)
770{ 793{
794 u64 tstamp = perf_event_time(event);
795
771 if (event->state <= PERF_EVENT_STATE_OFF) 796 if (event->state <= PERF_EVENT_STATE_OFF)
772 return 0; 797 return 0;
773 798
774 event->state = PERF_EVENT_STATE_ACTIVE; 799 event->state = PERF_EVENT_STATE_ACTIVE;
775 event->oncpu = smp_processor_id(); 800 event->oncpu = smp_processor_id();
801
802 /*
803 * Unthrottle events, since we scheduled we might have missed several
804 * ticks already, also for a heavily scheduling task there is little
805 * guarantee it'll get a tick in a timely manner.
806 */
807 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
808 perf_log_throttle(event, 1);
809 event->hw.interrupts = 0;
810 }
811
776 /* 812 /*
777 * The new state must be visible before we turn it on in the hardware: 813 * The new state must be visible before we turn it on in the hardware:
778 */ 814 */
@@ -784,9 +820,9 @@ event_sched_in(struct perf_event *event,
784 return -EAGAIN; 820 return -EAGAIN;
785 } 821 }
786 822
787 event->tstamp_running += ctx->time - event->tstamp_stopped; 823 event->tstamp_running += tstamp - event->tstamp_stopped;
788 824
789 event->shadow_ctx_time = ctx->time - ctx->timestamp; 825 event->shadow_ctx_time = tstamp - ctx->timestamp;
790 826
791 if (!is_software_event(event)) 827 if (!is_software_event(event))
792 cpuctx->active_oncpu++; 828 cpuctx->active_oncpu++;
@@ -898,11 +934,13 @@ static int group_can_go_on(struct perf_event *event,
898static void add_event_to_ctx(struct perf_event *event, 934static void add_event_to_ctx(struct perf_event *event,
899 struct perf_event_context *ctx) 935 struct perf_event_context *ctx)
900{ 936{
937 u64 tstamp = perf_event_time(event);
938
901 list_add_event(event, ctx); 939 list_add_event(event, ctx);
902 perf_group_attach(event); 940 perf_group_attach(event);
903 event->tstamp_enabled = ctx->time; 941 event->tstamp_enabled = tstamp;
904 event->tstamp_running = ctx->time; 942 event->tstamp_running = tstamp;
905 event->tstamp_stopped = ctx->time; 943 event->tstamp_stopped = tstamp;
906} 944}
907 945
908/* 946/*
@@ -937,7 +975,7 @@ static void __perf_install_in_context(void *info)
937 975
938 add_event_to_ctx(event, ctx); 976 add_event_to_ctx(event, ctx);
939 977
940 if (event->cpu != -1 && event->cpu != smp_processor_id()) 978 if (!event_filter_match(event))
941 goto unlock; 979 goto unlock;
942 980
943 /* 981 /*
@@ -1042,14 +1080,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1042 struct perf_event_context *ctx) 1080 struct perf_event_context *ctx)
1043{ 1081{
1044 struct perf_event *sub; 1082 struct perf_event *sub;
1083 u64 tstamp = perf_event_time(event);
1045 1084
1046 event->state = PERF_EVENT_STATE_INACTIVE; 1085 event->state = PERF_EVENT_STATE_INACTIVE;
1047 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1086 event->tstamp_enabled = tstamp - event->total_time_enabled;
1048 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1087 list_for_each_entry(sub, &event->sibling_list, group_entry) {
1049 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1088 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1050 sub->tstamp_enabled = 1089 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1051 ctx->time - sub->total_time_enabled;
1052 }
1053 } 1090 }
1054} 1091}
1055 1092
@@ -1082,7 +1119,7 @@ static void __perf_event_enable(void *info)
1082 goto unlock; 1119 goto unlock;
1083 __perf_event_mark_enabled(event, ctx); 1120 __perf_event_mark_enabled(event, ctx);
1084 1121
1085 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1122 if (!event_filter_match(event))
1086 goto unlock; 1123 goto unlock;
1087 1124
1088 /* 1125 /*
@@ -1193,12 +1230,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1193 return 0; 1230 return 0;
1194} 1231}
1195 1232
1196enum event_type_t {
1197 EVENT_FLEXIBLE = 0x1,
1198 EVENT_PINNED = 0x2,
1199 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1200};
1201
1202static void ctx_sched_out(struct perf_event_context *ctx, 1233static void ctx_sched_out(struct perf_event_context *ctx,
1203 struct perf_cpu_context *cpuctx, 1234 struct perf_cpu_context *cpuctx,
1204 enum event_type_t event_type) 1235 enum event_type_t event_type)
@@ -1435,7 +1466,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1466 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1436 if (event->state <= PERF_EVENT_STATE_OFF) 1467 if (event->state <= PERF_EVENT_STATE_OFF)
1437 continue; 1468 continue;
1438 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1469 if (!event_filter_match(event))
1439 continue; 1470 continue;
1440 1471
1441 if (group_can_go_on(event, cpuctx, 1)) 1472 if (group_can_go_on(event, cpuctx, 1))
@@ -1467,7 +1498,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1467 * Listen to the 'cpu' scheduling filter constraint 1498 * Listen to the 'cpu' scheduling filter constraint
1468 * of events: 1499 * of events:
1469 */ 1500 */
1470 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1501 if (!event_filter_match(event))
1471 continue; 1502 continue;
1472 1503
1473 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1504 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1580,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task)
1580 } 1611 }
1581} 1612}
1582 1613
1583#define MAX_INTERRUPTS (~0ULL)
1584
1585static void perf_log_throttle(struct perf_event *event, int enable);
1586
1587static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 1614static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1588{ 1615{
1589 u64 frequency = event->attr.sample_freq; 1616 u64 frequency = event->attr.sample_freq;
@@ -1694,7 +1721,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1694 if (event->state != PERF_EVENT_STATE_ACTIVE) 1721 if (event->state != PERF_EVENT_STATE_ACTIVE)
1695 continue; 1722 continue;
1696 1723
1697 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1724 if (!event_filter_match(event))
1698 continue; 1725 continue;
1699 1726
1700 hwc = &event->hw; 1727 hwc = &event->hw;
@@ -1885,11 +1912,12 @@ static void __perf_event_read(void *info)
1885 return; 1912 return;
1886 1913
1887 raw_spin_lock(&ctx->lock); 1914 raw_spin_lock(&ctx->lock);
1888 update_context_time(ctx); 1915 if (ctx->is_active)
1916 update_context_time(ctx);
1889 update_event_times(event); 1917 update_event_times(event);
1918 if (event->state == PERF_EVENT_STATE_ACTIVE)
1919 event->pmu->read(event);
1890 raw_spin_unlock(&ctx->lock); 1920 raw_spin_unlock(&ctx->lock);
1891
1892 event->pmu->read(event);
1893} 1921}
1894 1922
1895static inline u64 perf_event_count(struct perf_event *event) 1923static inline u64 perf_event_count(struct perf_event *event)
@@ -1983,8 +2011,7 @@ static int alloc_callchain_buffers(void)
1983 * accessed from NMI. Use a temporary manual per cpu allocation 2011 * accessed from NMI. Use a temporary manual per cpu allocation
1984 * until that gets sorted out. 2012 * until that gets sorted out.
1985 */ 2013 */
1986 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * 2014 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
1987 num_possible_cpus();
1988 2015
1989 entries = kzalloc(size, GFP_KERNEL); 2016 entries = kzalloc(size, GFP_KERNEL);
1990 if (!entries) 2017 if (!entries)
@@ -2185,13 +2212,6 @@ find_lively_task_by_vpid(pid_t vpid)
2185 if (!task) 2212 if (!task)
2186 return ERR_PTR(-ESRCH); 2213 return ERR_PTR(-ESRCH);
2187 2214
2188 /*
2189 * Can't attach events to a dying task.
2190 */
2191 err = -ESRCH;
2192 if (task->flags & PF_EXITING)
2193 goto errout;
2194
2195 /* Reuse ptrace permission checks for now. */ 2215 /* Reuse ptrace permission checks for now. */
2196 err = -EACCES; 2216 err = -EACCES;
2197 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2217 if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2212,14 +2232,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2212 unsigned long flags; 2232 unsigned long flags;
2213 int ctxn, err; 2233 int ctxn, err;
2214 2234
2215 if (!task && cpu != -1) { 2235 if (!task) {
2216 /* Must be root to operate on a CPU event: */ 2236 /* Must be root to operate on a CPU event: */
2217 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2237 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2218 return ERR_PTR(-EACCES); 2238 return ERR_PTR(-EACCES);
2219 2239
2220 if (cpu < 0 || cpu >= nr_cpumask_bits)
2221 return ERR_PTR(-EINVAL);
2222
2223 /* 2240 /*
2224 * We could be clever and allow to attach a event to an 2241 * We could be clever and allow to attach a event to an
2225 * offline CPU and activate it when the CPU comes up, but 2242 * offline CPU and activate it when the CPU comes up, but
@@ -2255,14 +2272,27 @@ retry:
2255 2272
2256 get_ctx(ctx); 2273 get_ctx(ctx);
2257 2274
2258 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { 2275 err = 0;
2259 /* 2276 mutex_lock(&task->perf_event_mutex);
2260 * We raced with some other task; use 2277 /*
2261 * the context they set. 2278 * If it has already passed perf_event_exit_task().
2262 */ 2279 * we must see PF_EXITING, it takes this mutex too.
2280 */
2281 if (task->flags & PF_EXITING)
2282 err = -ESRCH;
2283 else if (task->perf_event_ctxp[ctxn])
2284 err = -EAGAIN;
2285 else
2286 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2287 mutex_unlock(&task->perf_event_mutex);
2288
2289 if (unlikely(err)) {
2263 put_task_struct(task); 2290 put_task_struct(task);
2264 kfree(ctx); 2291 kfree(ctx);
2265 goto retry; 2292
2293 if (err == -EAGAIN)
2294 goto retry;
2295 goto errout;
2266 } 2296 }
2267 } 2297 }
2268 2298
@@ -3893,7 +3923,7 @@ static int perf_event_task_match(struct perf_event *event)
3893 if (event->state < PERF_EVENT_STATE_INACTIVE) 3923 if (event->state < PERF_EVENT_STATE_INACTIVE)
3894 return 0; 3924 return 0;
3895 3925
3896 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3926 if (!event_filter_match(event))
3897 return 0; 3927 return 0;
3898 3928
3899 if (event->attr.comm || event->attr.mmap || 3929 if (event->attr.comm || event->attr.mmap ||
@@ -4030,7 +4060,7 @@ static int perf_event_comm_match(struct perf_event *event)
4030 if (event->state < PERF_EVENT_STATE_INACTIVE) 4060 if (event->state < PERF_EVENT_STATE_INACTIVE)
4031 return 0; 4061 return 0;
4032 4062
4033 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4063 if (!event_filter_match(event))
4034 return 0; 4064 return 0;
4035 4065
4036 if (event->attr.comm) 4066 if (event->attr.comm)
@@ -4178,7 +4208,7 @@ static int perf_event_mmap_match(struct perf_event *event,
4178 if (event->state < PERF_EVENT_STATE_INACTIVE) 4208 if (event->state < PERF_EVENT_STATE_INACTIVE)
4179 return 0; 4209 return 0;
4180 4210
4181 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4211 if (!event_filter_match(event))
4182 return 0; 4212 return 0;
4183 4213
4184 if ((!executable && event->attr.mmap_data) || 4214 if ((!executable && event->attr.mmap_data) ||
@@ -4648,7 +4678,7 @@ int perf_swevent_get_recursion_context(void)
4648} 4678}
4649EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4679EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4650 4680
4651void inline perf_swevent_put_recursion_context(int rctx) 4681inline void perf_swevent_put_recursion_context(int rctx)
4652{ 4682{
4653 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4683 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4654 4684
@@ -5361,6 +5391,8 @@ free_dev:
5361 goto out; 5391 goto out;
5362} 5392}
5363 5393
5394static struct lock_class_key cpuctx_mutex;
5395
5364int perf_pmu_register(struct pmu *pmu, char *name, int type) 5396int perf_pmu_register(struct pmu *pmu, char *name, int type)
5365{ 5397{
5366 int cpu, ret; 5398 int cpu, ret;
@@ -5409,6 +5441,7 @@ skip_type:
5409 5441
5410 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5442 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5411 __perf_event_init_context(&cpuctx->ctx); 5443 __perf_event_init_context(&cpuctx->ctx);
5444 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5412 cpuctx->ctx.type = cpu_context; 5445 cpuctx->ctx.type = cpu_context;
5413 cpuctx->ctx.pmu = pmu; 5446 cpuctx->ctx.pmu = pmu;
5414 cpuctx->jiffies_interval = 1; 5447 cpuctx->jiffies_interval = 1;
@@ -5525,6 +5558,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5525 struct hw_perf_event *hwc; 5558 struct hw_perf_event *hwc;
5526 long err; 5559 long err;
5527 5560
5561 if ((unsigned)cpu >= nr_cpu_ids) {
5562 if (!task || cpu != -1)
5563 return ERR_PTR(-EINVAL);
5564 }
5565
5528 event = kzalloc(sizeof(*event), GFP_KERNEL); 5566 event = kzalloc(sizeof(*event), GFP_KERNEL);
5529 if (!event) 5567 if (!event)
5530 return ERR_PTR(-ENOMEM); 5568 return ERR_PTR(-ENOMEM);
@@ -5573,7 +5611,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5573 5611
5574 if (!overflow_handler && parent_event) 5612 if (!overflow_handler && parent_event)
5575 overflow_handler = parent_event->overflow_handler; 5613 overflow_handler = parent_event->overflow_handler;
5576 5614
5577 event->overflow_handler = overflow_handler; 5615 event->overflow_handler = overflow_handler;
5578 5616
5579 if (attr->disabled) 5617 if (attr->disabled)
@@ -6109,7 +6147,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6109 * scheduled, so we are now safe from rescheduling changing 6147 * scheduled, so we are now safe from rescheduling changing
6110 * our context. 6148 * our context.
6111 */ 6149 */
6112 child_ctx = child->perf_event_ctxp[ctxn]; 6150 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6113 task_ctx_sched_out(child_ctx, EVENT_ALL); 6151 task_ctx_sched_out(child_ctx, EVENT_ALL);
6114 6152
6115 /* 6153 /*
@@ -6422,11 +6460,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6422 unsigned long flags; 6460 unsigned long flags;
6423 int ret = 0; 6461 int ret = 0;
6424 6462
6425 child->perf_event_ctxp[ctxn] = NULL;
6426
6427 mutex_init(&child->perf_event_mutex);
6428 INIT_LIST_HEAD(&child->perf_event_list);
6429
6430 if (likely(!parent->perf_event_ctxp[ctxn])) 6463 if (likely(!parent->perf_event_ctxp[ctxn]))
6431 return 0; 6464 return 0;
6432 6465
@@ -6478,7 +6511,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6478 6511
6479 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 6512 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6480 parent_ctx->rotate_disable = 0; 6513 parent_ctx->rotate_disable = 0;
6481 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6482 6514
6483 child_ctx = child->perf_event_ctxp[ctxn]; 6515 child_ctx = child->perf_event_ctxp[ctxn];
6484 6516
@@ -6486,12 +6518,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6486 /* 6518 /*
6487 * Mark the child context as a clone of the parent 6519 * Mark the child context as a clone of the parent
6488 * context, or of whatever the parent is a clone of. 6520 * context, or of whatever the parent is a clone of.
6489 * Note that if the parent is a clone, it could get 6521 *
6490 * uncloned at any point, but that doesn't matter 6522 * Note that if the parent is a clone, the holding of
6491 * because the list of events and the generation 6523 * parent_ctx->lock avoids it from being uncloned.
6492 * count can't have changed since we took the mutex.
6493 */ 6524 */
6494 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6525 cloned_ctx = parent_ctx->parent_ctx;
6495 if (cloned_ctx) { 6526 if (cloned_ctx) {
6496 child_ctx->parent_ctx = cloned_ctx; 6527 child_ctx->parent_ctx = cloned_ctx;
6497 child_ctx->parent_gen = parent_ctx->parent_gen; 6528 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6502,6 +6533,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6502 get_ctx(child_ctx->parent_ctx); 6533 get_ctx(child_ctx->parent_ctx);
6503 } 6534 }
6504 6535
6536 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6505 mutex_unlock(&parent_ctx->mutex); 6537 mutex_unlock(&parent_ctx->mutex);
6506 6538
6507 perf_unpin_context(parent_ctx); 6539 perf_unpin_context(parent_ctx);
@@ -6516,6 +6548,10 @@ int perf_event_init_task(struct task_struct *child)
6516{ 6548{
6517 int ctxn, ret; 6549 int ctxn, ret;
6518 6550
6551 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
6552 mutex_init(&child->perf_event_mutex);
6553 INIT_LIST_HEAD(&child->perf_event_list);
6554
6519 for_each_task_context_nr(ctxn) { 6555 for_each_task_context_nr(ctxn) {
6520 ret = perf_event_init_context(child, ctxn); 6556 ret = perf_event_init_context(child, ctxn);
6521 if (ret) 6557 if (ret)
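Note: several hunks above replace the open-coded cpu filter test "event->cpu != -1 && event->cpu != smp_processor_id()" with event_filter_match(). The helper itself is introduced elsewhere in this patch and is not visible in this excerpt; the sketch below (assuming the internal headers already pulled in by kernel/perf_event.c) only restates the condition those call sites used to open-code and may differ from the real definition.

static inline int event_filter_match(struct perf_event *event)
{
        /* unfiltered (cpu == -1) or bound to the CPU we are running on */
        return event->cpu == -1 || event->cpu == smp_processor_id();
}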
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
516 516
517 local_irq_disable(); 517 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
519 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 520 error = -EAGAIN;
521 goto Power_up; 521 goto Power_up;
522 } 522 }
@@ -647,6 +647,7 @@ int hibernate(void)
647 swsusp_free(); 647 swsusp_free();
648 if (!error) 648 if (!error)
649 power_down(); 649 power_down();
650 in_suspend = 0;
650 pm_restore_gfp_mask(); 651 pm_restore_gfp_mask();
651 } else { 652 } else {
652 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
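Note: the wakeup checks above flip polarity. pm_check_wakeup_events() returned true while it was still safe to proceed, whereas pm_wakeup_pending() returns true once a wakeup source has fired, so each call site drops its negation. A minimal caller sketch under that reading; the function name is made up and not part of the patch.

#include <linux/errno.h>
#include <linux/suspend.h>

static int example_enter_sleep_step(void)
{
        if (pm_wakeup_pending())        /* was: if (!pm_check_wakeup_events()) */
                return -EAGAIN;         /* abort the transition and roll back */
        /* ... perform the low-level suspend/hibernate step ... */
        return 0;
}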
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..701853042c28 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
326 326
327static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
328{ 328{
329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
330 330
331 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
332} 332}
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
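Note: the deletion above only removes kernel/power/nvs.c from this directory; the interface it implements is presumably kept elsewhere in the tree, which this excerpt does not show. For reference, a sketch of the call order the removed code implies, using the prototypes visible above; the platform hooks and the address range are made up.

#include <linux/suspend.h>

static int example_platform_prepare(void)
{
        int err;

        /* describe the firmware NVS window once (range is illustrative) */
        err = suspend_nvs_register(0x000f0000, 0x2000);
        if (err)
                return err;

        /* just before entering the sleep state */
        err = suspend_nvs_alloc();
        if (err)
                return err;
        suspend_nvs_save();
        return 0;
}

static void example_platform_resume(void)
{
        suspend_nvs_restore();  /* runs with IRQs off, hence no iounmap here */
        suspend_nvs_free();
}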
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
53 todo = 0; 53 todo = 0;
54 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 55 do_each_thread(g, p) {
56 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
57 continue; 57 continue;
58 58
59 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
161 167
162 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
163 do_each_thread(g, p) { 169 do_each_thread(g, p) {
164 if (!freezeable(p)) 170 if (!freezable(p))
165 continue; 171 continue;
166 172
167 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
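Note: the comment added above describes the handshake between freeze_task() setting TIF_FREEZE and the target task noticing it. The cooperating side of that protocol, for a kernel thread, is the usual try_to_freeze() polling loop; the sketch below is illustrative and not taken from the patch.

#include <linux/freezer.h>
#include <linux/kthread.h>

static int example_freezable_thread(void *data)
{
        set_freezable();        /* clear PF_NOFREEZE so freezable() accepts us */

        while (!kthread_should_stop()) {
                try_to_freeze();        /* parks here while the system is frozen */
                /* ... do one unit of work, then sleep waiting for more ... */
        }
        return 0;
}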
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..64db648ff911 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1519,11 +1519,8 @@ static int
1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1520 unsigned int nr_pages, unsigned int nr_highmem) 1520 unsigned int nr_pages, unsigned int nr_highmem)
1521{ 1521{
1522 int error = 0;
1523
1524 if (nr_highmem > 0) { 1522 if (nr_highmem > 0) {
1525 error = get_highmem_buffer(PG_ANY); 1523 if (get_highmem_buffer(PG_ANY))
1526 if (error)
1527 goto err_out; 1524 goto err_out;
1528 if (nr_highmem > alloc_highmem) { 1525 if (nr_highmem > alloc_highmem) {
1529 nr_highmem -= alloc_highmem; 1526 nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1546 1543
1547 err_out: 1544 err_out:
1548 swsusp_free(); 1545 swsusp_free();
1549 return error; 1546 return -ENOMEM;
1550} 1547}
1551 1548
1552asmlinkage int swsusp_save(void) 1549asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 031d5e3a6197..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
31 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
32}; 32};
33 33
34static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
35 35
36/** 36/**
37 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
38 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
39 */ 39 */
40void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
41{ 41{
42 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
43 suspend_ops = ops; 43 suspend_ops = ops;
@@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
164 164
165 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
166 if (!error) { 166 if (!error) {
167 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
169 events_check_enabled = false; 169 events_check_enabled = false;
170 } 170 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
224 return res; 224 return res;
225 225
226 root_swap = res; 226 root_swap = res;
227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
228 if (res) 228 if (res)
229 return res; 229 return res;
230 230
@@ -888,7 +888,7 @@ out_finish:
888/** 888/**
889 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
890 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
891 * be written into this memeory location 891 * be written into this memory location
892 */ 892 */
893 893
894int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
930{ 930{
931 int error; 931 int error;
932 932
933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
934 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
935 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
936 clear_page(swsusp_header); 937 clear_page(swsusp_header);
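Note: the two hunks above switch from open_by_devnum()/blkdev_get() to the holder-aware blkdev_get_by_dev() API; a NULL holder keeps the open non-exclusive, as before. A minimal sketch of the new open path with a made-up function name and trimmed error handling.

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>

static int example_open_resume_device(dev_t devt)
{
        struct block_device *bdev;

        bdev = blkdev_get_by_dev(devt, FMODE_READ, NULL);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        set_blocksize(bdev, PAGE_SIZE);
        /* ... read and verify the swsusp signature page ... */
        return 0;
}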
diff --git a/kernel/printk.c b/kernel/printk.c
index f64b8997fc76..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,6 +39,7 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
@@ -96,7 +97,7 @@ static int console_locked, console_suspended;
96/* 97/*
97 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 98 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
98 * It is also used in interesting ways to provide interlocking in 99 * It is also used in interesting ways to provide interlocking in
99 * release_console_sem(). 100 * console_unlock();.
100 */ 101 */
101static DEFINE_SPINLOCK(logbuf_lock); 102static DEFINE_SPINLOCK(logbuf_lock);
102 103
@@ -261,25 +262,47 @@ int dmesg_restrict = 1;
261int dmesg_restrict; 262int dmesg_restrict;
262#endif 263#endif
263 264
265static int syslog_action_restricted(int type)
266{
267 if (dmesg_restrict)
268 return 1;
269 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
270 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
271}
272
273static int check_syslog_permissions(int type, bool from_file)
274{
275 /*
276 * If this is from /proc/kmsg and we've already opened it, then we've
277 * already done the capabilities checks at open time.
278 */
279 if (from_file && type != SYSLOG_ACTION_OPEN)
280 return 0;
281
282 if (syslog_action_restricted(type)) {
283 if (capable(CAP_SYSLOG))
284 return 0;
285 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
286 if (capable(CAP_SYS_ADMIN)) {
287 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
288 "but no CAP_SYSLOG (deprecated).\n");
289 return 0;
290 }
291 return -EPERM;
292 }
293 return 0;
294}
295
264int do_syslog(int type, char __user *buf, int len, bool from_file) 296int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 297{
266 unsigned i, j, limit, count; 298 unsigned i, j, limit, count;
267 int do_clear = 0; 299 int do_clear = 0;
268 char c; 300 char c;
269 int error = 0; 301 int error;
270 302
271 /* 303 error = check_syslog_permissions(type, from_file);
272 * If this is from /proc/kmsg we only do the capabilities checks 304 if (error)
273 * at open time. 305 goto out;
274 */
275 if (type == SYSLOG_ACTION_OPEN || !from_file) {
276 if (dmesg_restrict && !capable(CAP_SYSLOG))
277 goto warn; /* switch to return -EPERM after 2.6.39 */
278 if ((type != SYSLOG_ACTION_READ_ALL &&
279 type != SYSLOG_ACTION_SIZE_BUFFER) &&
280 !capable(CAP_SYSLOG))
281 goto warn; /* switch to return -EPERM after 2.6.39 */
282 }
283 306
284 error = security_syslog(type); 307 error = security_syslog(type);
285 if (error) 308 if (error)
@@ -422,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
422 } 445 }
423out: 446out:
424 return error; 447 return error;
425warn:
426 /* remove after 2.6.39 */
427 if (capable(CAP_SYS_ADMIN))
428 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
429 "but no CAP_SYSLOG (deprecated and denied).\n");
430 return -EPERM;
431} 448}
432 449
433SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 450SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start,
500/* 517/*
501 * Call the console drivers, asking them to write out 518 * Call the console drivers, asking them to write out
502 * log_buf[start] to log_buf[end - 1]. 519 * log_buf[start] to log_buf[end - 1].
503 * The console_sem must be held. 520 * The console_lock must be held.
504 */ 521 */
505static void call_console_drivers(unsigned start, unsigned end) 522static void call_console_drivers(unsigned start, unsigned end)
506{ 523{
@@ -603,11 +620,11 @@ static int have_callable_console(void)
603 * 620 *
604 * This is printk(). It can be called from any context. We want it to work. 621 * This is printk(). It can be called from any context. We want it to work.
605 * 622 *
606 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 623 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
607 * call the console drivers. If we fail to get the semaphore we place the output 624 * call the console drivers. If we fail to get the semaphore we place the output
608 * into the log buffer and return. The current holder of the console_sem will 625 * into the log buffer and return. The current holder of the console_sem will
609 * notice the new output in release_console_sem() and will send it to the 626 * notice the new output in console_unlock(); and will send it to the
610 * consoles before releasing the semaphore. 627 * consoles before releasing the lock.
611 * 628 *
612 * One effect of this deferred printing is that code which calls printk() and 629 * One effect of this deferred printing is that code which calls printk() and
613 * then changes console_loglevel may break. This is because console_loglevel 630 * then changes console_loglevel may break. This is because console_loglevel
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu)
658/* 675/*
659 * Try to get console ownership to actually show the kernel 676 * Try to get console ownership to actually show the kernel
660 * messages from a 'printk'. Return true (and with the 677 * messages from a 'printk'. Return true (and with the
661 * console_semaphore held, and 'console_locked' set) if it 678 * console_lock held, and 'console_locked' set) if it
662 * is successful, false otherwise. 679 * is successful, false otherwise.
663 * 680 *
664 * This gets called with the 'logbuf_lock' spinlock held and 681 * This gets called with the 'logbuf_lock' spinlock held and
665 * interrupts disabled. It should return with 'lockbuf_lock' 682 * interrupts disabled. It should return with 'lockbuf_lock'
666 * released but interrupts still disabled. 683 * released but interrupts still disabled.
667 */ 684 */
668static int acquire_console_semaphore_for_printk(unsigned int cpu) 685static int console_trylock_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock) 686 __releases(&logbuf_lock)
670{ 687{
671 int retval = 0; 688 int retval = 0;
672 689
673 if (!try_acquire_console_sem()) { 690 if (console_trylock()) {
674 retval = 1; 691 retval = 1;
675 692
676 /* 693 /*
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
826 * actual magic (print out buffers, wake up klogd, 843 * actual magic (print out buffers, wake up klogd,
827 * etc). 844 * etc).
828 * 845 *
829 * The acquire_console_semaphore_for_printk() function 846 * The console_trylock_for_printk() function
830 * will release 'logbuf_lock' regardless of whether it 847 * will release 'logbuf_lock' regardless of whether it
831 * actually gets the semaphore or not. 848 * actually gets the semaphore or not.
832 */ 849 */
833 if (acquire_console_semaphore_for_printk(this_cpu)) 850 if (console_trylock_for_printk(this_cpu))
834 release_console_sem(); 851 console_unlock();
835 852
836 lockdep_on(); 853 lockdep_on();
837out_restore_irqs: 854out_restore_irqs:
@@ -992,7 +1009,7 @@ void suspend_console(void)
992 if (!console_suspend_enabled) 1009 if (!console_suspend_enabled)
993 return; 1010 return;
994 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1011 printk("Suspending console(s) (use no_console_suspend to debug)\n");
995 acquire_console_sem(); 1012 console_lock();
996 console_suspended = 1; 1013 console_suspended = 1;
997 up(&console_sem); 1014 up(&console_sem);
998} 1015}
@@ -1003,7 +1020,7 @@ void resume_console(void)
1003 return; 1020 return;
1004 down(&console_sem); 1021 down(&console_sem);
1005 console_suspended = 0; 1022 console_suspended = 0;
1006 release_console_sem(); 1023 console_unlock();
1007} 1024}
1008 1025
1009/** 1026/**
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1026 case CPU_DYING: 1043 case CPU_DYING:
1027 case CPU_DOWN_FAILED: 1044 case CPU_DOWN_FAILED:
1028 case CPU_UP_CANCELED: 1045 case CPU_UP_CANCELED:
1029 acquire_console_sem(); 1046 console_lock();
1030 release_console_sem(); 1047 console_unlock();
1031 } 1048 }
1032 return NOTIFY_OK; 1049 return NOTIFY_OK;
1033} 1050}
1034 1051
1035/** 1052/**
1036 * acquire_console_sem - lock the console system for exclusive use. 1053 * console_lock - lock the console system for exclusive use.
1037 * 1054 *
1038 * Acquires a semaphore which guarantees that the caller has 1055 * Acquires a lock which guarantees that the caller has
1039 * exclusive access to the console system and the console_drivers list. 1056 * exclusive access to the console system and the console_drivers list.
1040 * 1057 *
1041 * Can sleep, returns nothing. 1058 * Can sleep, returns nothing.
1042 */ 1059 */
1043void acquire_console_sem(void) 1060void console_lock(void)
1044{ 1061{
1045 BUG_ON(in_interrupt()); 1062 BUG_ON(in_interrupt());
1046 down(&console_sem); 1063 down(&console_sem);
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void)
1049 console_locked = 1; 1066 console_locked = 1;
1050 console_may_schedule = 1; 1067 console_may_schedule = 1;
1051} 1068}
1052EXPORT_SYMBOL(acquire_console_sem); 1069EXPORT_SYMBOL(console_lock);
1053 1070
1054int try_acquire_console_sem(void) 1071/**
1072 * console_trylock - try to lock the console system for exclusive use.
1073 *
1074 * Tried to acquire a lock which guarantees that the caller has
1075 * exclusive access to the console system and the console_drivers list.
1076 *
1077 * returns 1 on success, and 0 on failure to acquire the lock.
1078 */
1079int console_trylock(void)
1055{ 1080{
1056 if (down_trylock(&console_sem)) 1081 if (down_trylock(&console_sem))
1057 return -1; 1082 return 0;
1058 if (console_suspended) { 1083 if (console_suspended) {
1059 up(&console_sem); 1084 up(&console_sem);
1060 return -1; 1085 return 0;
1061 } 1086 }
1062 console_locked = 1; 1087 console_locked = 1;
1063 console_may_schedule = 0; 1088 console_may_schedule = 0;
1064 return 0; 1089 return 1;
1065} 1090}
1066EXPORT_SYMBOL(try_acquire_console_sem); 1091EXPORT_SYMBOL(console_trylock);
1067 1092
1068int is_console_locked(void) 1093int is_console_locked(void)
1069{ 1094{
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void)
1094} 1119}
1095 1120
1096/** 1121/**
1097 * release_console_sem - unlock the console system 1122 * console_unlock - unlock the console system
1098 * 1123 *
1099 * Releases the semaphore which the caller holds on the console system 1124 * Releases the console_lock which the caller holds on the console system
1100 * and the console driver list. 1125 * and the console driver list.
1101 * 1126 *
1102 * While the semaphore was held, console output may have been buffered 1127 * While the console_lock was held, console output may have been buffered
1103 * by printk(). If this is the case, release_console_sem() emits 1128 * by printk(). If this is the case, console_unlock(); emits
1104 * the output prior to releasing the semaphore. 1129 * the output prior to releasing the lock.
1105 * 1130 *
1106 * If there is output waiting for klogd, we wake it up. 1131 * If there is output waiting for klogd, we wake it up.
1107 * 1132 *
1108 * release_console_sem() may be called from any context. 1133 * console_unlock(); may be called from any context.
1109 */ 1134 */
1110void release_console_sem(void) 1135void console_unlock(void)
1111{ 1136{
1112 unsigned long flags; 1137 unsigned long flags;
1113 unsigned _con_start, _log_end; 1138 unsigned _con_start, _log_end;
@@ -1140,7 +1165,7 @@ void release_console_sem(void)
1140 if (wake_klogd) 1165 if (wake_klogd)
1141 wake_up_klogd(); 1166 wake_up_klogd();
1142} 1167}
1143EXPORT_SYMBOL(release_console_sem); 1168EXPORT_SYMBOL(console_unlock);
1144 1169
1145/** 1170/**
1146 * console_conditional_schedule - yield the CPU if required 1171 * console_conditional_schedule - yield the CPU if required
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem);
1149 * if this CPU should yield the CPU to another task, do 1174 * if this CPU should yield the CPU to another task, do
1150 * so here. 1175 * so here.
1151 * 1176 *
1152 * Must be called within acquire_console_sem(). 1177 * Must be called within console_lock();.
1153 */ 1178 */
1154void __sched console_conditional_schedule(void) 1179void __sched console_conditional_schedule(void)
1155{ 1180{
@@ -1170,14 +1195,14 @@ void console_unblank(void)
1170 if (down_trylock(&console_sem) != 0) 1195 if (down_trylock(&console_sem) != 0)
1171 return; 1196 return;
1172 } else 1197 } else
1173 acquire_console_sem(); 1198 console_lock();
1174 1199
1175 console_locked = 1; 1200 console_locked = 1;
1176 console_may_schedule = 0; 1201 console_may_schedule = 0;
1177 for_each_console(c) 1202 for_each_console(c)
1178 if ((c->flags & CON_ENABLED) && c->unblank) 1203 if ((c->flags & CON_ENABLED) && c->unblank)
1179 c->unblank(); 1204 c->unblank();
1180 release_console_sem(); 1205 console_unlock();
1181} 1206}
1182 1207
1183/* 1208/*
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index)
1188 struct console *c; 1213 struct console *c;
1189 struct tty_driver *driver = NULL; 1214 struct tty_driver *driver = NULL;
1190 1215
1191 acquire_console_sem(); 1216 console_lock();
1192 for_each_console(c) { 1217 for_each_console(c) {
1193 if (!c->device) 1218 if (!c->device)
1194 continue; 1219 continue;
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index)
1196 if (driver) 1221 if (driver)
1197 break; 1222 break;
1198 } 1223 }
1199 release_console_sem(); 1224 console_unlock();
1200 return driver; 1225 return driver;
1201} 1226}
1202 1227
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index)
1207 */ 1232 */
1208void console_stop(struct console *console) 1233void console_stop(struct console *console)
1209{ 1234{
1210 acquire_console_sem(); 1235 console_lock();
1211 console->flags &= ~CON_ENABLED; 1236 console->flags &= ~CON_ENABLED;
1212 release_console_sem(); 1237 console_unlock();
1213} 1238}
1214EXPORT_SYMBOL(console_stop); 1239EXPORT_SYMBOL(console_stop);
1215 1240
1216void console_start(struct console *console) 1241void console_start(struct console *console)
1217{ 1242{
1218 acquire_console_sem(); 1243 console_lock();
1219 console->flags |= CON_ENABLED; 1244 console->flags |= CON_ENABLED;
1220 release_console_sem(); 1245 console_unlock();
1221} 1246}
1222EXPORT_SYMBOL(console_start); 1247EXPORT_SYMBOL(console_start);
1223 1248
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon)
1339 * Put this console in the list - keep the 1364 * Put this console in the list - keep the
1340 * preferred driver at the head of the list. 1365 * preferred driver at the head of the list.
1341 */ 1366 */
1342 acquire_console_sem(); 1367 console_lock();
1343 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1368 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1344 newcon->next = console_drivers; 1369 newcon->next = console_drivers;
1345 console_drivers = newcon; 1370 console_drivers = newcon;
@@ -1351,14 +1376,14 @@ void register_console(struct console *newcon)
1351 } 1376 }
1352 if (newcon->flags & CON_PRINTBUFFER) { 1377 if (newcon->flags & CON_PRINTBUFFER) {
1353 /* 1378 /*
1354 * release_console_sem() will print out the buffered messages 1379 * console_unlock(); will print out the buffered messages
1355 * for us. 1380 * for us.
1356 */ 1381 */
1357 spin_lock_irqsave(&logbuf_lock, flags); 1382 spin_lock_irqsave(&logbuf_lock, flags);
1358 con_start = log_start; 1383 con_start = log_start;
1359 spin_unlock_irqrestore(&logbuf_lock, flags); 1384 spin_unlock_irqrestore(&logbuf_lock, flags);
1360 } 1385 }
1361 release_console_sem(); 1386 console_unlock();
1362 console_sysfs_notify(); 1387 console_sysfs_notify();
1363 1388
1364 /* 1389 /*
@@ -1395,7 +1420,7 @@ int unregister_console(struct console *console)
1395 return braille_unregister_console(console); 1420 return braille_unregister_console(console);
1396#endif 1421#endif
1397 1422
1398 acquire_console_sem(); 1423 console_lock();
1399 if (console_drivers == console) { 1424 if (console_drivers == console) {
1400 console_drivers=console->next; 1425 console_drivers=console->next;
1401 res = 0; 1426 res = 0;
@@ -1417,7 +1442,7 @@ int unregister_console(struct console *console)
1417 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1442 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1418 console_drivers->flags |= CON_CONSDEV; 1443 console_drivers->flags |= CON_CONSDEV;
1419 1444
1420 release_console_sem(); 1445 console_unlock();
1421 console_sysfs_notify(); 1446 console_sysfs_notify();
1422 return res; 1447 return res;
1423} 1448}
@@ -1502,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1502 /* Don't allow registering multiple times */ 1527 /* Don't allow registering multiple times */
1503 if (!dumper->registered) { 1528 if (!dumper->registered) {
1504 dumper->registered = 1; 1529 dumper->registered = 1;
1505 list_add_tail(&dumper->list, &dump_list); 1530 list_add_tail_rcu(&dumper->list, &dump_list);
1506 err = 0; 1531 err = 0;
1507 } 1532 }
1508 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1526,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1526 spin_lock_irqsave(&dump_list_lock, flags); 1551 spin_lock_irqsave(&dump_list_lock, flags);
1527 if (dumper->registered) { 1552 if (dumper->registered) {
1528 dumper->registered = 0; 1553 dumper->registered = 0;
1529 list_del(&dumper->list); 1554 list_del_rcu(&dumper->list);
1530 err = 0; 1555 err = 0;
1531 } 1556 }
1532 spin_unlock_irqrestore(&dump_list_lock, flags); 1557 spin_unlock_irqrestore(&dump_list_lock, flags);
1558 synchronize_rcu();
1533 1559
1534 return err; 1560 return err;
1535} 1561}
1536EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1562EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1537 1563
1538static const char * const kmsg_reasons[] = {
1539 [KMSG_DUMP_OOPS] = "oops",
1540 [KMSG_DUMP_PANIC] = "panic",
1541 [KMSG_DUMP_KEXEC] = "kexec",
1542};
1543
1544static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1545{
1546 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1547 return "unknown";
1548
1549 return kmsg_reasons[reason];
1550}
1551
1552/** 1564/**
1553 * kmsg_dump - dump kernel log to kernel message dumpers. 1565 * kmsg_dump - dump kernel log to kernel message dumpers.
1554 * @reason: the reason (oops, panic etc) for dumping 1566 * @reason: the reason (oops, panic etc) for dumping
@@ -1587,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1587 l2 = chars; 1599 l2 = chars;
1588 } 1600 }
1589 1601
1590 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1602 rcu_read_lock();
1591 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1603 list_for_each_entry_rcu(dumper, &dump_list, list)
1592 kmsg_to_str(reason));
1593 return;
1594 }
1595 list_for_each_entry(dumper, &dump_list, list)
1596 dumper->dump(dumper, reason, s1, l1, s2, l2); 1604 dumper->dump(dumper, reason, s1, l1, s2, l2);
1597 spin_unlock_irqrestore(&dump_list_lock, flags); 1605 rcu_read_unlock();
1598} 1606}
1599#endif 1607#endif
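Note: most of this file's diff is the mechanical rename from acquire/release_console_sem() to console_lock()/console_unlock(). The one semantic change worth calling out is console_trylock(), which now follows the usual trylock convention (nonzero on success) instead of returning 0 on success. A usage sketch under the new names; the helper itself is made up.

#include <linux/console.h>

static void example_enable_console(struct console *con)
{
        if (!console_trylock())         /* 0 now means "did not get the lock" */
                console_lock();         /* fall back to the sleeping acquisition */

        con->flags |= CON_ENABLED;      /* console_drivers state is exclusively ours */
        console_unlock();               /* also flushes printk output buffered meanwhile */
}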
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..e2302e40b360 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 163 return !err;
164} 164}
165 165
166int ptrace_attach(struct task_struct *task) 166static int ptrace_attach(struct task_struct *task)
167{ 167{
168 int retval; 168 int retval;
169 169
@@ -219,7 +219,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 219 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 220 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 221 */
222int ptrace_traceme(void) 222static int ptrace_traceme(void)
223{ 223{
224 int ret = -EPERM; 224 int ret = -EPERM;
225 225
@@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 293 return false;
294} 294}
295 295
296int ptrace_detach(struct task_struct *child, unsigned int data) 296static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 297{
298 bool dead = false; 298 bool dead = false;
299 299
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
313 child->exit_code = data; 313 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 314 dead = __ptrace_detach(current, child);
315 if (!child->exit_state) 315 if (!child->exit_state)
316 wake_up_process(child); 316 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
317 } 317 }
318 write_unlock_irq(&tasklist_lock); 318 write_unlock_irq(&tasklist_lock);
319 319
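Note: the last hunk above wakes a detached tracee with wake_up_state() rather than wake_up_process(). To the best of my reading, wake_up_process() is equivalent to wake_up_state(p, TASK_NORMAL), i.e. it only wakes interruptible/uninterruptible sleepers, so a child parked in ptrace or job-control stop would be missed. Hypothetical helper, not from the patch:

#include <linux/sched.h>

static void example_kick_detached_child(struct task_struct *child)
{
        /* TASK_NORMAL would not cover TASK_TRACED or TASK_STOPPED */
        wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}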
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
189 unsigned long flags; 189 unsigned long flags;
190 190
191 for (;;) { 191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); 192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
193 morework = rcu_boost(); 194 morework = rcu_boost();
194 local_irq_save(flags); 195 local_irq_save(flags);
195 work = have_rcu_kthread_work; 196 work = have_rcu_kthread_work;
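Note: switching the RCU kthread from wait_event() to wait_event_interruptible() is the usual way to keep a mostly idle kthread out of TASK_UNINTERRUPTIBLE sleep, which would otherwise count toward the load average. A sketch of the idiom with made-up names; the real loop is in the hunk above.

#include <linux/kthread.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_have_work;

static int example_kthread(void *unused)
{
        for (;;) {
                wait_event_interruptible(example_wq,
                                         example_have_work || kthread_should_stop());
                if (kthread_should_stop())
                        break;
                /* ... consume example_have_work ... */
        }
        return 0;
}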
diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb0941fa84..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
553 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
554 unsigned int ttwu_count; 554 unsigned int ttwu_count;
555 unsigned int ttwu_local; 555 unsigned int ttwu_local;
556
557 /* BKL stats */
558 unsigned int bkl_count;
559#endif 556#endif
560}; 557};
561 558
@@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
609 struct task_group *tg; 606 struct task_group *tg;
610 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
611 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
@@ -2505,7 +2505,7 @@ out:
2505 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2506 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2507 * 2507 *
2508 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2510 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2511 */ 2511 */
@@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3887 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3892 } 3892 }
3893#endif 3893#endif
@@ -4871,7 +4871,8 @@ recheck:
4871 * assigned. 4871 * assigned.
4872 */ 4872 */
4873 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4874 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4875 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4876 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4877 return -EPERM; 4878 return -EPERM;
@@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8882 } 8883 }
8883} 8884}
8884 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
8891 * Ignore this case since the task hasn't ran yet, this avoids
8892 * trying to poke a half freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8885#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8886static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8887 u64 shareval) 8902 u64 shareval)
@@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8954 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
8955 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
8956 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
8957 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
8958 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
8959 .early_init = 1, 8975 .early_init = 1,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b8f84c..9fb656283157 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
27{ 27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref); 28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29 29
30#ifdef CONFIG_RT_GROUP_SCHED
31 /* We've redirected RT tasks to the root task group... */
32 ag->tg->rt_se = NULL;
33 ag->tg->rt_rq = NULL;
34#endif
30 sched_destroy_group(ag->tg); 35 sched_destroy_group(ag->tg);
31} 36}
32 37
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
55 return ag; 60 return ag;
56} 61}
57 62
63#ifdef CONFIG_RT_GROUP_SCHED
64static void free_rt_sched_group(struct task_group *tg);
65#endif
66
58static inline struct autogroup *autogroup_create(void) 67static inline struct autogroup *autogroup_create(void)
59{ 68{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 69 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
72 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg; 83 ag->tg = tg;
84#ifdef CONFIG_RT_GROUP_SCHED
85 /*
86 * Autogroup RT tasks are redirected to the root task group
87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group()
91 * returns &root_task_group, so zero bandwidth is required.
92 */
93 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se;
95 tg->rt_rq = root_task_group.rt_rq;
96#endif
75 tg->autogroup = ag; 97 tg->autogroup = ag;
76 98
77 return ag; 99 return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
106 return true; 128 return true;
107} 129}
108 130
131static inline bool task_group_is_autogroup(struct task_group *tg)
132{
133 return tg != &root_task_group && tg->autogroup;
134}
135
109static inline struct task_group * 136static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg) 137autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{ 138{
@@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
231#ifdef CONFIG_SCHED_DEBUG 258#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{ 260{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
262
263 if (!enabled || !tg->autogroup)
264 return 0;
265
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235} 267}
236#endif /* CONFIG_SCHED_DEBUG */ 268#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..7b859ffe5dad 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
15 15
16static inline void autogroup_init(struct task_struct *init_task) { } 16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { } 17static inline void autogroup_free(struct task_group *tg) { }
18static inline bool task_group_is_autogroup(struct task_group *tg)
19{
20 return 0;
21}
18 22
19static inline struct task_group * 23static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg) 24autogroup_task_group(struct task_struct *p, struct task_group *tg)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
86} 88}
87#endif 89#endif
88 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
89static void 111static void
90print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
91{ 113{
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
108 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
110#endif 132#endif
133#ifdef CONFIG_CGROUP_SCHED
134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
135#endif
111 136
112 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
113} 138}
@@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 struct sched_entity *last; 169 struct sched_entity *last;
145 unsigned long flags; 170 unsigned long flags;
146 171
172#ifdef CONFIG_FAIR_GROUP_SCHED
173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
174#else
147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
176#endif
148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 177 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
149 SPLIT_NS(cfs_rq->exec_clock)); 178 SPLIT_NS(cfs_rq->exec_clock));
150 179
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 220
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{ 222{
223#ifdef CONFIG_RT_GROUP_SCHED
224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
225#else
194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
227#endif
195 228
196#define P(x) \ 229#define P(x) \
197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running;
212static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
213{ 246{
214 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
215 249
216#ifdef CONFIG_X86 250#ifdef CONFIG_X86
217 { 251 {
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
262 P(ttwu_count); 296 P(ttwu_count);
263 P(ttwu_local); 297 P(ttwu_local);
264 298
265 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
266 301
267#undef P 302#undef P
303#undef P64
268#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
269 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
270 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
271 308
309 rcu_read_lock();
272 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
273} 313}
274 314
275static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
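
The task_group_path() helper added above resolves a group's display name with a fixed precedence: the autogroup name if autogroup naming applies, otherwise the cgroup path, otherwise an empty string while the cgroup is still being created. A user-space sketch of that precedence; the structures and the static buffer only loosely mirror the scheduler debug code:

#include <stdio.h>

struct task_group {
        long autogroup_id;      /* 0 means "no autogroup" */
        const char *cgroup;     /* NULL means "cgroup not fully created yet" */
};

static char group_path[256];

static const char *sketch_task_group_path(const struct task_group *tg)
{
        if (tg->autogroup_id) {                 /* autogroup_path() case */
                snprintf(group_path, sizeof(group_path),
                         "/autogroup-%ld", tg->autogroup_id);
                return group_path;
        }
        if (!tg->cgroup) {                      /* cgroup not attached yet */
                group_path[0] = '\0';
                return group_path;
        }
        snprintf(group_path, sizeof(group_path), "%s", tg->cgroup);
        return group_path;
}

int main(void)
{
        struct task_group a = { .autogroup_id = 7 };
        struct task_group b = { .cgroup = "/system/foo" };

        printf("%s\n", sketch_task_group_path(&a));     /* /autogroup-7 */
        printf("%s\n", sketch_task_group_path(&b));     /* /system/foo  */
        return 0;
}

As in the kernel, the single static buffer means callers must consume the result before the next lookup.
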
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
699 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
700} 700}
701 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED 702#ifdef CONFIG_FAIR_GROUP_SCHED
703# ifdef CONFIG_SMP
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 704static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update) 705 int global_update)
705{ 706{
@@ -721,10 +722,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
721 u64 now, delta; 722 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight; 723 unsigned long load = cfs_rq->load.weight;
723 724
724 if (!cfs_rq) 725 if (cfs_rq->tg == &root_task_group)
725 return; 726 return;
726 727
727 now = rq_of(cfs_rq)->clock; 728 now = rq_of(cfs_rq)->clock_task;
728 delta = now - cfs_rq->load_stamp; 729 delta = now - cfs_rq->load_stamp;
729 730
730 /* truncate load history at 4 idle periods */ 731 /* truncate load history at 4 idle periods */
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
762 list_del_leaf_cfs_rq(cfs_rq); 763 list_del_leaf_cfs_rq(cfs_rq);
763} 764}
764 765
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
767 long weight_delta)
768{
769 long load_weight, load, shares;
770
771 load = cfs_rq->load.weight + weight_delta;
772
773 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load;
776
777 shares = (tg->shares * load);
778 if (load_weight)
779 shares /= load_weight;
780
781 if (shares < MIN_SHARES)
782 shares = MIN_SHARES;
783 if (shares > tg->shares)
784 shares = tg->shares;
785
786 return shares;
787}
788
789static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0);
794 }
795}
796# else /* CONFIG_SMP */
797static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{
799}
800
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
802 long weight_delta)
803{
804 return tg->shares;
805}
806
807static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
808{
809}
810# endif /* CONFIG_SMP */
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 811static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight) 812 unsigned long weight)
767{ 813{
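
calc_cfs_shares(), factored out above, gives each runqueue a slice of tg->shares proportional to its share of the group-wide load, clamped between MIN_SHARES and the group's total. The arithmetic in isolation, with the MIN_SHARES value assumed here for illustration:

#include <stdio.h>

#define MIN_SHARES 2    /* assumed floor for this sketch */

/* shares = tg_shares * rq_load / (group_load - old_contribution + rq_load),
 * clamped so one runqueue never gets more than the whole group's shares. */
static long sketch_calc_shares(long tg_shares, long group_load_sum,
                               long rq_contribution, long rq_load)
{
        long load_weight = group_load_sum - rq_contribution + rq_load;
        long shares = tg_shares * rq_load;

        if (load_weight)
                shares /= load_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        /* Two CPUs carrying 2048 and 1024 units of load in a 1024-share group:
         * the busier CPU ends up with roughly two thirds of the shares. */
        printf("%ld\n", sketch_calc_shares(1024, 3072, 2048, 2048)); /* 682 */
        printf("%ld\n", sketch_calc_shares(1024, 3072, 1024, 1024)); /* 341 */
        return 0;
}
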
@@ -782,41 +828,20 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{ 828{
783 struct task_group *tg; 829 struct task_group *tg;
784 struct sched_entity *se; 830 struct sched_entity *se;
785 long load_weight, load, shares; 831 long shares;
786
787 if (!cfs_rq)
788 return;
789 832
790 tg = cfs_rq->tg; 833 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))]; 834 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se) 835 if (!se)
793 return; 836 return;
794 837#ifndef CONFIG_SMP
795 load = cfs_rq->load.weight + weight_delta; 838 if (likely(se->load.weight == tg->shares))
796 839 return;
797 load_weight = atomic_read(&tg->load_weight); 840#endif
798 load_weight -= cfs_rq->load_contribution; 841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809 842
810 reweight_entity(cfs_rq_of(se), se, shares); 843 reweight_entity(cfs_rq_of(se), se, shares);
811} 844}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */ 845#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 846static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{ 847{
@@ -1062,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1062 struct sched_entity *se = __pick_next_entity(cfs_rq); 1087 struct sched_entity *se = __pick_next_entity(cfs_rq);
1063 s64 delta = curr->vruntime - se->vruntime; 1088 s64 delta = curr->vruntime - se->vruntime;
1064 1089
1090 if (delta < 0)
1091 return;
1092
1065 if (delta > ideal_runtime) 1093 if (delta > ideal_runtime)
1066 resched_task(rq_of(cfs_rq)->curr); 1094 resched_task(rq_of(cfs_rq)->curr);
1067 } 1095 }
@@ -1362,27 +1390,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1362 return wl; 1390 return wl;
1363 1391
1364 for_each_sched_entity(se) { 1392 for_each_sched_entity(se) {
1365 long S, rw, s, a, b; 1393 long lw, w;
1366 1394
1367 S = se->my_q->tg->shares; 1395 tg = se->my_q->tg;
1368 s = se->load.weight; 1396 w = se->my_q->load.weight;
1369 rw = se->my_q->load.weight;
1370 1397
1371 a = S*(rw + wl); 1398 /* use this cpu's instantaneous contribution */
1372 b = S*rw + s*wg; 1399 lw = atomic_read(&tg->load_weight);
1400 lw -= se->my_q->load_contribution;
1401 lw += w + wg;
1373 1402
1374 wl = s*(a-b); 1403 wl += w;
1375 1404
1376 if (likely(b)) 1405 if (lw > 0 && wl < lw)
1377 wl /= b; 1406 wl = (wl * tg->shares) / lw;
1407 else
1408 wl = tg->shares;
1378 1409
1379 /* 1410 /* zero point is MIN_SHARES */
1380 * Assume the group is already running and will 1411 if (wl < MIN_SHARES)
1381 * thus already be accounted for in the weight. 1412 wl = MIN_SHARES;
1382 * 1413 wl -= se->load.weight;
1383 * That is, moving shares between CPUs, does not
1384 * alter the group weight.
1385 */
1386 wg = 0; 1414 wg = 0;
1387 } 1415 }
1388 1416
@@ -1401,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1401 1429
1402static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1430static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1403{ 1431{
1404 unsigned long this_load, load; 1432 s64 this_load, load;
1405 int idx, this_cpu, prev_cpu; 1433 int idx, this_cpu, prev_cpu;
1406 unsigned long tl_per_task; 1434 unsigned long tl_per_task;
1407 struct task_group *tg; 1435 struct task_group *tg;
@@ -1440,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1440 * Otherwise check if either cpus are near enough in load to allow this 1468 * Otherwise check if either cpus are near enough in load to allow this
1441 * task to be woken on this_cpu. 1469 * task to be woken on this_cpu.
1442 */ 1470 */
1443 if (this_load) { 1471 if (this_load > 0) {
1444 unsigned long this_eff_load, prev_eff_load; 1472 s64 this_eff_load, prev_eff_load;
1445 1473
1446 this_eff_load = 100; 1474 this_eff_load = 100;
1447 this_eff_load *= power_of(prev_cpu); 1475 this_eff_load *= power_of(prev_cpu);
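
wake_affine() above switches its load variables from unsigned long to s64 because effective_load() can now return a negative adjustment; in unsigned arithmetic that wraps to a huge value and the this_load > 0 test stops meaning "loaded at all". A short demonstration of the wrap-around, with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        long adjustment = -512;                  /* effective_load() may be negative */
        unsigned long u_load = 256 + adjustment; /* wraps to a huge positive value   */
        int64_t s_load = 256 + adjustment;       /* stays -256, as intended          */

        printf("unsigned: %lu (looks heavily loaded)\n", u_load);
        printf("signed:   %lld (correctly negative)\n", (long long)s_load);

        /* Only the signed form answers "is this cpu loaded at all?" correctly. */
        printf("unsigned > 0: %d, signed > 0: %d\n", u_load > 0, s_load > 0);
        return 0;
}
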
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..ad6267714c84 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
626 u64 delta_exec; 626 u64 delta_exec;
627 627
628 if (!task_has_rt_policy(curr)) 628 if (curr->sched_class != &rt_sched_class)
629 return; 629 return;
630 630
631 delta_exec = rq->clock_task - curr->se.exec_start; 631 delta_exec = rq->clock_task - curr->se.exec_start;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 194 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 196 int refs;
197 void (*func) (void *info);
196 198
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
198 continue; 215 continue;
199 216
217 func = data->csd.func; /* for later warn */
200 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
201 219
220 /*
221 * If the cpu mask is not still set then it enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
202 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 233
210 if (refs) 234 if (refs)
211 continue; 235 continue;
212 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
213 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
214 } 244 }
215 245
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 459 * can't happen.
430 */ 460 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
433 463
434 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
453 483
454 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
456 487
457 data->csd.func = func; 488 data->csd.func = func;
458 data->csd.info = info; 489 data->csd.info = info;
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
494 * To ensure the interrupt handler gets an complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
462 502
463 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
529{ 569{
530 raw_spin_unlock_irq(&call_function.lock); 570 raw_spin_unlock_irq(&call_function.lock);
531} 571}
572#endif /* USE_GENERIC_SMP_HELPERS */
573
574/*
575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
578 */
579int on_each_cpu(void (*func) (void *info), void *info, int wait)
580{
581 unsigned long flags;
582 int ret = 0;
583
584 preempt_disable();
585 ret = smp_call_function(func, info, wait);
586 local_irq_save(flags);
587 func(info);
588 local_irq_restore(flags);
589 preempt_enable();
590 return ret;
591}
592EXPORT_SYMBOL(on_each_cpu);
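
The new comments and barriers in smp.c describe a publish/consume protocol: smp_call_function_many() writes the cpumask before setting refs (with smp_wmb() in between), and generic_smp_call_function_interrupt() reads the cpumask, issues smp_rmb(), then reads refs, so it never pairs a stale cpumask with a fresh refcount. A user-space analogue of the same ordering idea with C11 release/acquire atomics; the thread names and the spin loop are inventions for illustration, and the acquire load stands in for the receiver's read barrier:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static unsigned int cpumask;            /* payload: written before the marker */
static atomic_int refs;                 /* marker: "this request is live"     */

static void *sender(void *arg)
{
        (void)arg;
        cpumask = 0x2;                                  /* 1. publish the payload     */
        atomic_store_explicit(&refs, 1,                 /* 2. then publish the marker */
                              memory_order_release);    /*    (kernel: smp_wmb())     */
        return NULL;
}

static void *receiver(void *arg)
{
        (void)arg;
        /* The kernel checks the cpumask, does smp_rmb(), then checks refs; the
         * acquire load below gives the equivalent "marker read orders the
         * payload read" guarantee in user space. */
        while (atomic_load_explicit(&refs, memory_order_acquire) == 0)
                ;                                       /* spin until published */
        printf("refs!=0, cpumask=%#x\n", cpumask);      /* guaranteed to be 0x2 */
        return NULL;
}

int main(void)
{
        pthread_t s, r;

        pthread_create(&r, NULL, receiver, NULL);
        pthread_create(&s, NULL, sender, NULL);
        pthread_join(s, NULL);
        pthread_join(r, NULL);
        return 0;
}
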
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0823778f87fc..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
885} 885}
886early_initcall(spawn_ksoftirqd); 886early_initcall(spawn_ksoftirqd);
887 887
888#ifdef CONFIG_SMP
889/*
890 * Call a function on all processors
891 */
892int on_each_cpu(void (*func) (void *info), void *info, int wait)
893{
894 int ret = 0;
895
896 preempt_disable();
897 ret = smp_call_function(func, info, wait);
898 local_irq_disable();
899 func(info);
900 local_irq_enable();
901 preempt_enable();
902 return ret;
903}
904EXPORT_SYMBOL(on_each_cpu);
905#endif
906
907/* 888/*
908 * [ These __weak aliases are kept in a separate compilation unit, so that 889 * [ These __weak aliases are kept in a separate compilation unit, so that
909 * GCC does not inline them incorrectly. ] 890 * GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
157 157
158/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
159 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
160 */ 170 */
161static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
207 * will have finished executing. We initially give readers 217 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their 218 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ 219 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration. 220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
211 */ 222 */
212 223
213 if (srcu_readers_active_idx(sp, idx)) 224 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); 225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
215 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
216 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
217 228
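
The new SYNCHRONIZE_SRCU_READER_DELAY comment describes an adaptive wait: give readers about 10 microseconds to leave their read-side critical sections, then fall back to sleeping roughly one tick per retry. A user-space sketch of that spin-then-sleep loop; readers_active() and the canned poll count are stand-ins for srcu_readers_active_idx(), and usleep() replaces both udelay() and schedule_timeout_interruptible():

#include <stdio.h>
#include <unistd.h>

#define SYNCHRONIZE_READER_DELAY_US 10          /* short optimistic wait */

/* Stand-in for srcu_readers_active_idx(): pretend three polls are needed
 * before the last reader leaves its critical section. */
static int remaining_polls = 3;

static int readers_active(void)
{
        return remaining_polls-- > 0;
}

static void wait_for_readers(void)
{
        /* Give readers a brief chance to finish almost immediately ... */
        if (readers_active())
                usleep(SYNCHRONIZE_READER_DELAY_US);

        /* ... then back off to ~1 ms sleeps (about 1/HZ) until they are gone. */
        while (readers_active())
                usleep(1000);
}

int main(void)
{
        wait_for_readers();
        printf("grace period over\n");
        return 0;
}
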
diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcdb6c6c..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1379,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
1379 const struct cred *cred = current_cred(), *tcred; 1385 const struct cred *cred = current_cred(), *tcred;
1380 1386
1381 tcred = __task_cred(task); 1387 tcred = __task_cred(task);
1382 if ((cred->uid != tcred->euid || 1388 if (current != task &&
1389 (cred->uid != tcred->euid ||
1383 cred->uid != tcred->suid || 1390 cred->uid != tcred->suid ||
1384 cred->uid != tcred->uid || 1391 cred->uid != tcred->uid ||
1385 cred->gid != tcred->egid || 1392 cred->gid != tcred->egid ||
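
The check_prlimit_permission() change above lets a task adjust its own limits unconditionally; only cross-task requests must match all of the target's uids and gids (or hold CAP_SYS_RESOURCE, which the kernel checks separately and is omitted here). A stand-alone sketch of that predicate, with simplified credential structures:

#include <stdbool.h>
#include <stdio.h>

struct cred { unsigned int uid, euid, suid, gid, egid, sgid; };

static bool prlimit_permitted(bool same_task, const struct cred *me,
                              const struct cred *target)
{
        if (same_task)
                return true;    /* a task may always touch its own limits */
        return me->uid == target->uid  && me->uid == target->euid &&
               me->uid == target->suid && me->gid == target->gid  &&
               me->gid == target->egid && me->gid == target->sgid;
}

int main(void)
{
        struct cred a = { .uid = 1000, .euid = 0 };     /* setuid target */

        printf("self: %d, other: %d\n",
               prlimit_permitted(true, &a, &a),
               prlimit_permitted(false, &a, &a));       /* 1, 0 */
        return 0;
}
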
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1e3ced..0f1bd83db985 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
169#endif 170#endif
170 171
171#ifdef CONFIG_MAGIC_SYSRQ 172#ifdef CONFIG_MAGIC_SYSRQ
172static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 173/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
173 175
174static int sysrq_sysctl_handler(ctl_table *table, int write, 176static int sysrq_sysctl_handler(ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, 177 void __user *buffer, size_t *lenp,
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 247 .mode = 0555,
246 .child = dev_table, 248 .child = dev_table,
247 }, 249 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 250 { }
253}; 251};
254 252
@@ -710,6 +708,15 @@ static struct ctl_table kern_table[] = {
710 .extra1 = &zero, 708 .extra1 = &zero,
711 .extra2 = &one, 709 .extra2 = &one,
712 }, 710 },
711 {
712 .procname = "kptr_restrict",
713 .data = &kptr_restrict,
714 .maxlen = sizeof(int),
715 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax,
717 .extra1 = &zero,
718 .extra2 = &two,
719 },
713#endif 720#endif
714 { 721 {
715 .procname = "ngroups_max", 722 .procname = "ngroups_max",
@@ -962,10 +969,6 @@ static struct ctl_table kern_table[] = {
962 .proc_handler = proc_dointvec, 969 .proc_handler = proc_dointvec,
963 }, 970 },
964#endif 971#endif
965/*
966 * NOTE: do not add new entries to this table unless you have read
967 * Documentation/sysctl/ctl_unnumbered.txt
968 */
969 { } 972 { }
970}; 973};
971 974
@@ -1326,11 +1329,6 @@ static struct ctl_table vm_table[] = {
1326 .extra2 = &one, 1329 .extra2 = &one,
1327 }, 1330 },
1328#endif 1331#endif
1329
1330/*
1331 * NOTE: do not add new entries to this table unless you have read
1332 * Documentation/sysctl/ctl_unnumbered.txt
1333 */
1334 { } 1332 { }
1335}; 1333};
1336 1334
@@ -1486,10 +1484,6 @@ static struct ctl_table fs_table[] = {
1486 .proc_handler = &pipe_proc_fn, 1484 .proc_handler = &pipe_proc_fn,
1487 .extra1 = &pipe_min_size, 1485 .extra1 = &pipe_min_size,
1488 }, 1486 },
1489/*
1490 * NOTE: do not add new entries to this table unless you have read
1491 * Documentation/sysctl/ctl_unnumbered.txt
1492 */
1493 { } 1487 { }
1494}; 1488};
1495 1489
@@ -2899,7 +2893,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2899 } 2893 }
2900} 2894}
2901 2895
2902#else /* CONFIG_PROC_FS */ 2896#else /* CONFIG_PROC_SYSCTL */
2903 2897
2904int proc_dostring(struct ctl_table *table, int write, 2898int proc_dostring(struct ctl_table *table, int write,
2905 void __user *buffer, size_t *lenp, loff_t *ppos) 2899 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2951,7 +2945,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2951} 2945}
2952 2946
2953 2947
2954#endif /* CONFIG_PROC_FS */ 2948#endif /* CONFIG_PROC_SYSCTL */
2955 2949
2956/* 2950/*
2957 * No sense putting this after each symbol definition, twice, 2951 * No sense putting this after each symbol definition, twice,
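
The new kptr_restrict sysctl above is an integer clamped to 0..2 by proc_dointvec_minmax and selects how %pK-formatted kernel pointers are shown. A rough user-space sketch of the policy it chooses between; the privilege flag is a stand-in for the kernel's capability check and the exact output format is simplified:

#include <stdio.h>

/* 0 = show pointers, 1 = hide unless privileged, 2 = always hide */
static int kptr_restrict;

static void print_kernel_pointer(const void *p, int privileged)
{
        int hide = (kptr_restrict == 2) ||
                   (kptr_restrict == 1 && !privileged);

        if (hide)
                printf("pointer: %p\n", (void *)0);     /* printed as a null pointer */
        else
                printf("pointer: %p\n", p);
}

int main(void)
{
        int x;

        kptr_restrict = 1;
        print_kernel_pointer(&x, 0);    /* hidden from unprivileged readers */
        print_kernel_pointer(&x, 1);    /* visible to privileged readers */
        return 0;
}
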
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 4b2545a136ff..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1192 1192
1193 buf[result] = '\0'; 1193 buf[result] = '\0';
1194 1194
1195 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1196 result = -EIO; 1196 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1198 if (!nodep) 1198 if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 69691eb4b715..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -348,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
348 return ret; 348 return ret;
349} 349}
350 350
351#ifdef CONFIG_IA64 351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
352#define TASKSTATS_NEEDS_PADDING 1 352#define TASKSTATS_NEEDS_PADDING 1
353#endif 353#endif
354 354
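
TASKSTATS_NEEDS_PADDING above now covers any 64-bit architecture without efficient unaligned access, not just ia64: the taskstats payload inside the netlink message must start on an 8-byte boundary, so a dummy attribute pads it out. The alignment arithmetic on its own, with the netlink attribute plumbing omitted:

#include <stdio.h>

#define TASKSTATS_ALIGN 8

/* Bytes of dummy-attribute padding needed so that a payload starting at
 * 'offset' inside the message lands on an 8-byte boundary. */
static unsigned int padding_needed(unsigned int offset)
{
        return (TASKSTATS_ALIGN - (offset % TASKSTATS_ALIGN)) % TASKSTATS_ALIGN;
}

int main(void)
{
        unsigned int off;

        for (off = 20; off <= 24; off++)
                printf("offset %u -> pad %u\n", off, padding_needed(off));
        return 0;
}
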
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index df140cd3ea47..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -679,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
680{ 680{
681 681
682 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
683 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
684 684
685 /* Add clocksource to the clcoksource list */ 685 /* Add clocksource to the clcoksource list */
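
The renamed maxsec parameter documents what clocks_calc_mult_shift() guarantees: a mult/shift pair such that ns = (cycles * mult) >> shift cannot overflow 64 bits for intervals up to maxsec seconds. A small stand-alone check of that conversion for an assumed 24 MHz counter, with mult derived by hand rather than by the kernel helper:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t freq = 24000000;         /* counter ticks per second (example) */
        const uint32_t shift = 24;
        const uint32_t mult =
                (uint32_t)(((uint64_t)1000000000 << shift) / freq);

        uint64_t cycles = freq;                 /* exactly one second of ticks */
        uint64_t ns = (cycles * (uint64_t)mult) >> shift;

        /* ns comes out just under 1e9 because mult is truncated to an integer. */
        printf("mult=%u shift=%u -> %llu ns per %llu cycles\n",
               (unsigned)mult, (unsigned)shift,
               (unsigned long long)ns, (unsigned long long)cycles);
        return 0;
}
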
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
17 18
18/* 19/*
19 * NTP timekeeping variables: 20 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 75/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 76static s64 ntp_tick_adj;
76 77
78#ifdef CONFIG_NTP_PPS
79
80/*
81 * The following variables are used when a pulse-per-second (PPS) signal
82 * is available. They establish the engineering parameters of the clock
83 * discipline loop when controlled by the PPS signal.
84 */
85#define PPS_VALID 10 /* PPS signal watchdog max (s) */
86#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
87#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
88#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
89#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
90 increase pps_shift or consecutive bad
91 intervals to decrease it */
92#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
93
94static int pps_valid; /* signal watchdog counter */
95static long pps_tf[3]; /* phase median filter */
96static long pps_jitter; /* current jitter (ns) */
97static struct timespec pps_fbase; /* beginning of the last freq interval */
98static int pps_shift; /* current interval duration (s) (shift) */
99static int pps_intcnt; /* interval counter */
100static s64 pps_freq; /* frequency offset (scaled ns/s) */
101static long pps_stabil; /* current stability (scaled ns/s) */
102
103/*
104 * PPS signal quality monitors
105 */
106static long pps_calcnt; /* calibration intervals */
107static long pps_jitcnt; /* jitter limit exceeded */
108static long pps_stbcnt; /* stability limit exceeded */
109static long pps_errcnt; /* calibration errors */
110
111
112/* PPS kernel consumer compensates the whole phase error immediately.
113 * Otherwise, reduce the offset by a fixed factor times the time constant.
114 */
115static inline s64 ntp_offset_chunk(s64 offset)
116{
117 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
118 return offset;
119 else
120 return shift_right(offset, SHIFT_PLL + time_constant);
121}
122
123static inline void pps_reset_freq_interval(void)
124{
125 /* the PPS calibration interval may end
126 surprisingly early */
127 pps_shift = PPS_INTMIN;
128 pps_intcnt = 0;
129}
130
131/**
132 * pps_clear - Clears the PPS state variables
133 *
134 * Must be called while holding a write on the xtime_lock
135 */
136static inline void pps_clear(void)
137{
138 pps_reset_freq_interval();
139 pps_tf[0] = 0;
140 pps_tf[1] = 0;
141 pps_tf[2] = 0;
142 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
143 pps_freq = 0;
144}
145
146/* Decrease pps_valid to indicate that another second has passed since
147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
148 * missing.
149 *
150 * Must be called while holding a write on the xtime_lock
151 */
152static inline void pps_dec_valid(void)
153{
154 if (pps_valid > 0)
155 pps_valid--;
156 else {
157 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
158 STA_PPSWANDER | STA_PPSERROR);
159 pps_clear();
160 }
161}
162
163static inline void pps_set_freq(s64 freq)
164{
165 pps_freq = freq;
166}
167
168static inline int is_error_status(int status)
169{
170 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
171 /* PPS signal lost when either PPS time or
172 * PPS frequency synchronization requested
173 */
174 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
175 && !(time_status & STA_PPSSIGNAL))
176 /* PPS jitter exceeded when
177 * PPS time synchronization requested */
178 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
179 == (STA_PPSTIME|STA_PPSJITTER))
180 /* PPS wander exceeded or calibration error when
181 * PPS frequency synchronization requested
182 */
183 || ((time_status & STA_PPSFREQ)
184 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
185}
186
187static inline void pps_fill_timex(struct timex *txc)
188{
189 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
190 PPM_SCALE_INV, NTP_SCALE_SHIFT);
191 txc->jitter = pps_jitter;
192 if (!(time_status & STA_NANO))
193 txc->jitter /= NSEC_PER_USEC;
194 txc->shift = pps_shift;
195 txc->stabil = pps_stabil;
196 txc->jitcnt = pps_jitcnt;
197 txc->calcnt = pps_calcnt;
198 txc->errcnt = pps_errcnt;
199 txc->stbcnt = pps_stbcnt;
200}
201
202#else /* !CONFIG_NTP_PPS */
203
204static inline s64 ntp_offset_chunk(s64 offset)
205{
206 return shift_right(offset, SHIFT_PLL + time_constant);
207}
208
209static inline void pps_reset_freq_interval(void) {}
210static inline void pps_clear(void) {}
211static inline void pps_dec_valid(void) {}
212static inline void pps_set_freq(s64 freq) {}
213
214static inline int is_error_status(int status)
215{
216 return status & (STA_UNSYNC|STA_CLOCKERR);
217}
218
219static inline void pps_fill_timex(struct timex *txc)
220{
221 /* PPS is not implemented, so these are zero */
222 txc->ppsfreq = 0;
223 txc->jitter = 0;
224 txc->shift = 0;
225 txc->stabil = 0;
226 txc->jitcnt = 0;
227 txc->calcnt = 0;
228 txc->errcnt = 0;
229 txc->stbcnt = 0;
230}
231
232#endif /* CONFIG_NTP_PPS */
233
77/* 234/*
78 * NTP methods: 235 * NTP methods:
79 */ 236 */
@@ -185,6 +342,9 @@ void ntp_clear(void)
185 342
186 tick_length = tick_length_base; 343 tick_length = tick_length_base;
187 time_offset = 0; 344 time_offset = 0;
345
346 /* Clear PPS state variables */
347 pps_clear();
188} 348}
189 349
190/* 350/*
@@ -250,16 +410,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 410 time_status |= STA_UNSYNC;
251 } 411 }
252 412
253 /* 413 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 414 tick_length = tick_length_base;
258 415
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 416 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 417 time_offset -= delta;
261 tick_length += delta; 418 tick_length += delta;
262 419
420 /* Check PPS signal */
421 pps_dec_valid();
422
263 if (!time_adjust) 423 if (!time_adjust)
264 return; 424 return;
265 425
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 529 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 530 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 531 time_status = STA_UNSYNC;
532 /* restart PPS frequency calibration */
533 pps_reset_freq_interval();
372 } 534 }
373 535
374 /* 536 /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 580 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 581 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 582 time_freq = max(time_freq, -MAXFREQ_SCALED);
583 /* update pps_freq */
584 pps_set_freq(time_freq);
421 } 585 }
422 586
423 if (txc->modes & ADJ_MAXERROR) 587 if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
508 } 672 }
509 673
510 result = time_state; /* mostly `TIME_OK' */ 674 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 675 /* check for errors */
676 if (is_error_status(time_status))
512 result = TIME_ERROR; 677 result = TIME_ERROR;
513 678
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 679 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 687 txc->tick = tick_usec;
523 txc->tai = time_tai; 688 txc->tai = time_tai;
524 689
525 /* PPS is not implemented, so these are zero */ 690 /* fill PPS status fields */
526 txc->ppsfreq = 0; 691 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 692
535 write_sequnlock_irq(&xtime_lock); 693 write_sequnlock_irq(&xtime_lock);
536 694
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 702 return result;
545} 703}
546 704
705#ifdef CONFIG_NTP_PPS
706
707/* actually struct pps_normtime is good old struct timespec, but it is
708 * semantically different (and it is the reason why it was invented):
709 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
710 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
711struct pps_normtime {
712 __kernel_time_t sec; /* seconds */
713 long nsec; /* nanoseconds */
714};
715
716/* normalize the timestamp so that nsec is in the
717 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
718static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
719{
720 struct pps_normtime norm = {
721 .sec = ts.tv_sec,
722 .nsec = ts.tv_nsec
723 };
724
725 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
726 norm.nsec -= NSEC_PER_SEC;
727 norm.sec++;
728 }
729
730 return norm;
731}
732
733/* get current phase correction and jitter */
734static inline long pps_phase_filter_get(long *jitter)
735{
736 *jitter = pps_tf[0] - pps_tf[1];
737 if (*jitter < 0)
738 *jitter = -*jitter;
739
740 /* TODO: test various filters */
741 return pps_tf[0];
742}
743
744/* add the sample to the phase filter */
745static inline void pps_phase_filter_add(long err)
746{
747 pps_tf[2] = pps_tf[1];
748 pps_tf[1] = pps_tf[0];
749 pps_tf[0] = err;
750}
751
752/* decrease frequency calibration interval length.
753 * It is halved after four consecutive unstable intervals.
754 */
755static inline void pps_dec_freq_interval(void)
756{
757 if (--pps_intcnt <= -PPS_INTCOUNT) {
758 pps_intcnt = -PPS_INTCOUNT;
759 if (pps_shift > PPS_INTMIN) {
760 pps_shift--;
761 pps_intcnt = 0;
762 }
763 }
764}
765
766/* increase frequency calibration interval length.
767 * It is doubled after four consecutive stable intervals.
768 */
769static inline void pps_inc_freq_interval(void)
770{
771 if (++pps_intcnt >= PPS_INTCOUNT) {
772 pps_intcnt = PPS_INTCOUNT;
773 if (pps_shift < PPS_INTMAX) {
774 pps_shift++;
775 pps_intcnt = 0;
776 }
777 }
778}
779
780/* update clock frequency based on MONOTONIC_RAW clock PPS signal
781 * timestamps
782 *
783 * At the end of the calibration interval the difference between the
784 * first and last MONOTONIC_RAW clock timestamps divided by the length
785 * of the interval becomes the frequency update. If the interval was
786 * too long, the data are discarded.
787 * Returns the difference between old and new frequency values.
788 */
789static long hardpps_update_freq(struct pps_normtime freq_norm)
790{
791 long delta, delta_mod;
792 s64 ftemp;
793
794 /* check if the frequency interval was too long */
795 if (freq_norm.sec > (2 << pps_shift)) {
796 time_status |= STA_PPSERROR;
797 pps_errcnt++;
798 pps_dec_freq_interval();
799 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
800 freq_norm.sec);
801 return 0;
802 }
803
 804 /* here the raw frequency offset and wander (stability) are
805 * calculated. If the wander is less than the wander threshold
806 * the interval is increased; otherwise it is decreased.
807 */
808 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
809 freq_norm.sec);
810 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
811 pps_freq = ftemp;
812 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
813 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
814 time_status |= STA_PPSWANDER;
815 pps_stbcnt++;
816 pps_dec_freq_interval();
817 } else { /* good sample */
818 pps_inc_freq_interval();
819 }
820
821 /* the stability metric is calculated as the average of recent
822 * frequency changes, but is used only for performance
823 * monitoring
824 */
825 delta_mod = delta;
826 if (delta_mod < 0)
827 delta_mod = -delta_mod;
828 pps_stabil += (div_s64(((s64)delta_mod) <<
829 (NTP_SCALE_SHIFT - SHIFT_USEC),
830 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
831
832 /* if enabled, the system clock frequency is updated */
833 if ((time_status & STA_PPSFREQ) != 0 &&
834 (time_status & STA_FREQHOLD) == 0) {
835 time_freq = pps_freq;
836 ntp_update_frequency();
837 }
838
839 return delta;
840}
841
842/* correct REALTIME clock phase error against PPS signal */
843static void hardpps_update_phase(long error)
844{
845 long correction = -error;
846 long jitter;
847
848 /* add the sample to the median filter */
849 pps_phase_filter_add(correction);
850 correction = pps_phase_filter_get(&jitter);
851
852 /* Nominal jitter is due to PPS signal noise. If it exceeds the
853 * threshold, the sample is discarded; otherwise, if so enabled,
854 * the time offset is updated.
855 */
856 if (jitter > (pps_jitter << PPS_POPCORN)) {
857 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
858 jitter, (pps_jitter << PPS_POPCORN));
859 time_status |= STA_PPSJITTER;
860 pps_jitcnt++;
861 } else if (time_status & STA_PPSTIME) {
862 /* correct the time using the phase offset */
863 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
864 NTP_INTERVAL_FREQ);
865 /* cancel running adjtime() */
866 time_adjust = 0;
867 }
868 /* update jitter */
869 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
870}
871
872/*
873 * hardpps() - discipline CPU clock oscillator to external PPS signal
874 *
875 * This routine is called at each PPS signal arrival in order to
876 * discipline the CPU clock oscillator to the PPS signal. It takes two
877 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
878 * is used to correct clock phase error and the latter is used to
879 * correct the frequency.
880 *
881 * This code is based on David Mills's reference nanokernel
882 * implementation. It was mostly rewritten but keeps the same idea.
883 */
884void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
885{
886 struct pps_normtime pts_norm, freq_norm;
887 unsigned long flags;
888
889 pts_norm = pps_normalize_ts(*phase_ts);
890
891 write_seqlock_irqsave(&xtime_lock, flags);
892
893 /* clear the error bits, they will be set again if needed */
894 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
895
896 /* indicate signal presence */
897 time_status |= STA_PPSSIGNAL;
898 pps_valid = PPS_VALID;
899
900 /* when called for the first time,
901 * just start the frequency interval */
902 if (unlikely(pps_fbase.tv_sec == 0)) {
903 pps_fbase = *raw_ts;
904 write_sequnlock_irqrestore(&xtime_lock, flags);
905 return;
906 }
907
908 /* ok, now we have a base for frequency calculation */
909 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
910
911 /* check that the signal is in the range
912 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
913 if ((freq_norm.sec == 0) ||
914 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
915 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
916 time_status |= STA_PPSJITTER;
917 /* restart the frequency calibration interval */
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 pr_err("hardpps: PPSJITTER: bad pulse\n");
921 return;
922 }
923
924 /* signal is ok */
925
926 /* check if the current frequency interval is finished */
927 if (freq_norm.sec >= (1 << pps_shift)) {
928 pps_calcnt++;
929 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts;
931 hardpps_update_freq(freq_norm);
932 }
933
934 hardpps_update_phase(pts_norm.nsec);
935
936 write_sequnlock_irqrestore(&xtime_lock, flags);
937}
938EXPORT_SYMBOL(hardpps);
939
940#endif /* CONFIG_NTP_PPS */
941
547static int __init ntp_tick_adj_setup(char *str) 942static int __init ntp_tick_adj_setup(char *str)
548{ 943{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 944 ntp_tick_adj = simple_strtol(str, NULL, 0);
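
The hardpps() path added above works on normalized timestamps whose nanosecond field lies in (-NSEC_PER_SEC/2, NSEC_PER_SEC/2], so a pulse arriving just before a second boundary becomes a small negative phase error instead of nearly a full second. A user-space copy of that normalization step; the struct name and the sample value are only for illustration:

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime {
        time_t sec;
        long nsec;      /* in ( -NSEC_PER_SEC/2, NSEC_PER_SEC/2 ] */
};

/* Same folding as pps_normalize_ts(): nanoseconds in the upper half of a
 * second are expressed as a negative offset from the next second. */
static struct pps_normtime normalize(struct timespec ts)
{
        struct pps_normtime norm = { .sec = ts.tv_sec, .nsec = ts.tv_nsec };

        if (norm.nsec > (NSEC_PER_SEC >> 1)) {
                norm.nsec -= NSEC_PER_SEC;
                norm.sec++;
        }
        return norm;
}

int main(void)
{
        struct timespec late = { .tv_sec = 100, .tv_nsec = 999999000 };
        struct pps_normtime n = normalize(late);

        /* 100.999999 s becomes 101 s - 1000 ns: a -1 microsecond phase error. */
        printf("%ld s %+ld ns\n", (long)n.sec, n.nsec);
        return 0;
}
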
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..a3b5aff62606 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -600,4 +600,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 601}
602 602
603/*
604 * Check whether the broadcast device supports oneshot.
605 */
606bool tick_broadcast_oneshot_available(void)
607{
608 struct clock_event_device *bc = tick_broadcast_device.evtdev;
609
610 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
611}
612
603#endif 613#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..ed228ef6f6b8 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -51,7 +51,11 @@ int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
55 return 0;
56 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
57 return 1;
58 return tick_broadcast_oneshot_available();
55} 59}
56 60
57/* 61/*
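
tick_is_oneshot_available() above now also refuses one-shot mode when the per-cpu clock event device stops in deep C-states (CLOCK_EVT_FEAT_C3STOP) and no one-shot-capable broadcast device can take over. The decision reduced to a plain predicate; the feature bits and the broadcast query are simplified stand-ins:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_ONESHOT 0x1
#define FEAT_C3STOP  0x2

static bool oneshot_available(unsigned int dev_features,
                              bool broadcast_oneshot_available)
{
        if (!(dev_features & FEAT_ONESHOT))
                return false;                   /* device can't do one-shot      */
        if (!(dev_features & FEAT_C3STOP))
                return true;                    /* keeps ticking in deep idle    */
        return broadcast_oneshot_available;     /* needs a broadcast fallback    */
}

int main(void)
{
        printf("%d %d %d\n",
               oneshot_available(FEAT_ONESHOT, false),                /* 1 */
               oneshot_available(FEAT_ONESHOT | FEAT_C3STOP, false),  /* 0 */
               oneshot_available(FEAT_ONESHOT | FEAT_C3STOP, true));  /* 1 */
        return 0;
}
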
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..f65d3a723a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 37extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 38extern void tick_check_oneshot_broadcast(int cpu);
39bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 40# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 41static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 42{
@@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 47static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 48static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 49static inline void tick_check_oneshot_broadcast(int cpu) { }
50static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 51# endif /* !BROADCAST */
50 52
51#else /* !ONESHOT */ 53#else /* !ONESHOT */
@@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 78 return 0;
77} 79}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 80static inline int tick_broadcast_oneshot_active(void) { return 0; }
81static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 82#endif /* !TICK_ONESHOT */
80 83
81/* 84/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 642 }
643 local_irq_enable(); 643 local_irq_enable();
644 644
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 646}
648 647
649/* 648/*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
795 } 794 }
796 795
797#ifdef CONFIG_NO_HZ 796#ifdef CONFIG_NO_HZ
798 if (tick_nohz_enabled) 797 if (tick_nohz_enabled) {
799 ts->nohz_mode = NOHZ_MODE_HIGHRES; 798 ts->nohz_mode = NOHZ_MODE_HIGHRES;
799 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
800 }
800#endif 801#endif
801} 802}
802#endif /* HIGH_RES_TIMERS */ 803#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da82003..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
49 u32 mult; 49 u32 mult;
50}; 50};
51 51
52struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
53 53
54/** 54/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
164/* 164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */ 166 */
167struct timespec raw_time; 167static struct timespec raw_time;
168 168
169/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
288} 288}
289EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
290 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
291/** 334/**
292 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
293 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
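
getnstime_raw_and_real() above snapshots two clocks consistently by using the xtime seqlock retry loop: sample the sequence counter, copy both values, and retry if the counter changed, which means a writer ran in between. A user-space sketch of that reader-retry discipline with C11 atomics; the writer side and the real clock reads are left out:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* even = stable, odd = write in progress */
static long value_a, value_b;           /* the two values wanted as one snapshot  */

static void read_pair(long *a, long *b)
{
        unsigned int start;

        do {
                /* wait for an even (quiescent) sequence number */
                do {
                        start = atomic_load_explicit(&seq, memory_order_acquire);
                } while (start & 1);

                *a = value_a;           /* copy both halves of the snapshot */
                *b = value_b;

                atomic_thread_fence(memory_order_acquire);
                /* retry if a writer ran while we were copying */
        } while (atomic_load_explicit(&seq, memory_order_relaxed) != start);
}

int main(void)
{
        long a, b;

        value_a = 1;
        value_b = 2;
        read_pair(&a, &b);
        printf("%ld %ld\n", a, b);
        return 0;
}
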
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 32a19f9397fc..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca9936f2d0..d6459923d245 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
959 * 959 *
960 * Synchronization rules: Callers must prevent restarting of the timer, 960 * Synchronization rules: Callers must prevent restarting of the timer,
961 * otherwise this function is meaningless. It must not be called from 961 * otherwise this function is meaningless. It must not be called from
962 * hardirq contexts. The caller must not hold locks which would prevent 962 * interrupt contexts. The caller must not hold locks which would prevent
963 * completion of the timer's handler. The timer's handler must not call 963 * completion of the timer's handler. The timer's handler must not call
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 964 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 965 * not running on any CPU.
@@ -969,10 +969,12 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
969int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
970{ 970{
971#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
972 local_bh_disable(); 972 unsigned long flags;
973
974 local_irq_save(flags);
973 lock_map_acquire(&timer->lockdep_map); 975 lock_map_acquire(&timer->lockdep_map);
974 lock_map_release(&timer->lockdep_map); 976 lock_map_release(&timer->lockdep_map);
975 local_bh_enable(); 977 local_irq_restore(flags);
976#endif 978#endif
977 /* 979 /*
978 * don't use it in hardirq context, because it 980 * don't use it in hardirq context, because it
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..cbafed7d4f38 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
141 local_irq_save(flags); 148 local_irq_save(flags);
142 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
143 va_start(args, fmt); 150 va_start(args, fmt);
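Note on the hunk above: __trace_note_message() now drops note messages early unless the BLK_TC_NOTIFY bit is set in the trace's action mask. A minimal user-space sketch of the same mask-based filtering, with a made-up bit value:

/* Mask-based message filtering, mirroring the BLK_TC_NOTIFY check.
 * TC_NOTIFY is an illustrative bit, not the kernel's value. */
#include <stdarg.h>
#include <stdio.h>

#define TC_NOTIFY (1u << 0)

struct tracer {
        unsigned int act_mask;
};

static void trace_note(struct tracer *t, const char *fmt, ...)
{
        va_list args;

        if (!(t->act_mask & TC_NOTIFY))
                return;                 /* notes not requested: drop early */

        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
}

int main(void)
{
        struct tracer t = { .act_mask = 0 };

        trace_note(&t, "dropped\n");    /* filtered out */
        t.act_mask |= TC_NOTIFY;
        trace_note(&t, "logged\n");     /* emitted */
        return 0;
}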
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore,
758 * @q: queue the io is for 765 * @q: queue the io is for
759 * @bio: the source bio 766 * @bio: the source bio
760 * @what: the action 767 * @what: the action
768 * @error: error, if any
761 * 769 *
762 * Description: 770 * Description:
763 * Records an action against a bio. Will log the bio offset + size. 771 * Records an action against a bio. Will log the bio offset + size.
764 * 772 *
765 **/ 773 **/
766static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 774static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
767 u32 what) 775 u32 what, int error)
768{ 776{
769 struct blk_trace *bt = q->blk_trace; 777 struct blk_trace *bt = q->blk_trace;
770 778
771 if (likely(!bt)) 779 if (likely(!bt))
772 return; 780 return;
773 781
782 if (!error && !bio_flagged(bio, BIO_UPTODATE))
783 error = EIO;
784
774 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 785 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
775 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 786 error, 0, NULL);
776} 787}
777 788
778static void blk_add_trace_bio_bounce(void *ignore, 789static void blk_add_trace_bio_bounce(void *ignore,
779 struct request_queue *q, struct bio *bio) 790 struct request_queue *q, struct bio *bio)
780{ 791{
781 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 792 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
782} 793}
783 794
784static void blk_add_trace_bio_complete(void *ignore, 795static void blk_add_trace_bio_complete(void *ignore,
785 struct request_queue *q, struct bio *bio) 796 struct request_queue *q, struct bio *bio,
797 int error)
786{ 798{
787 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 799 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
788} 800}
789 801
790static void blk_add_trace_bio_backmerge(void *ignore, 802static void blk_add_trace_bio_backmerge(void *ignore,
791 struct request_queue *q, 803 struct request_queue *q,
792 struct bio *bio) 804 struct bio *bio)
793{ 805{
794 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
795} 807}
796 808
797static void blk_add_trace_bio_frontmerge(void *ignore, 809static void blk_add_trace_bio_frontmerge(void *ignore,
798 struct request_queue *q, 810 struct request_queue *q,
799 struct bio *bio) 811 struct bio *bio)
800{ 812{
801 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 813 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
802} 814}
803 815
804static void blk_add_trace_bio_queue(void *ignore, 816static void blk_add_trace_bio_queue(void *ignore,
805 struct request_queue *q, struct bio *bio) 817 struct request_queue *q, struct bio *bio)
806{ 818{
807 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 819 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
808} 820}
809 821
810static void blk_add_trace_getrq(void *ignore, 822static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore,
812 struct bio *bio, int rw) 824 struct bio *bio, int rw)
813{ 825{
814 if (bio) 826 if (bio)
815 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 827 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
816 else { 828 else {
817 struct blk_trace *bt = q->blk_trace; 829 struct blk_trace *bt = q->blk_trace;
818 830
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore,
827 struct bio *bio, int rw) 839 struct bio *bio, int rw)
828{ 840{
829 if (bio) 841 if (bio)
830 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 842 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
831 else { 843 else {
832 struct blk_trace *bt = q->blk_trace; 844 struct blk_trace *bt = q->blk_trace;
833 845
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore,
887} 899}
888 900
889/** 901/**
890 * blk_add_trace_remap - Add a trace for a remap operation 902 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
891 * @ignore: trace callback data parameter (not used) 903 * @ignore: trace callback data parameter (not used)
892 * @q: queue the io is for 904 * @q: queue the io is for
893 * @bio: the source bio 905 * @bio: the source bio
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore,
899 * it spans a stripe (or similar). Add a trace for that action. 911 * it spans a stripe (or similar). Add a trace for that action.
900 * 912 *
901 **/ 913 **/
902static void blk_add_trace_remap(void *ignore, 914static void blk_add_trace_bio_remap(void *ignore,
903 struct request_queue *q, struct bio *bio, 915 struct request_queue *q, struct bio *bio,
904 dev_t dev, sector_t from) 916 dev_t dev, sector_t from)
905{ 917{
906 struct blk_trace *bt = q->blk_trace; 918 struct blk_trace *bt = q->blk_trace;
907 struct blk_io_trace_remap r; 919 struct blk_io_trace_remap r;
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void)
1016 WARN_ON(ret); 1028 WARN_ON(ret);
1017 ret = register_trace_block_split(blk_add_trace_split, NULL); 1029 ret = register_trace_block_split(blk_add_trace_split, NULL);
1018 WARN_ON(ret); 1030 WARN_ON(ret);
1019 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1031 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1020 WARN_ON(ret); 1032 WARN_ON(ret);
1021 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1033 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1022 WARN_ON(ret); 1034 WARN_ON(ret);
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void)
1025static void blk_unregister_tracepoints(void) 1037static void blk_unregister_tracepoints(void)
1026{ 1038{
1027 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1028 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1029 unregister_trace_block_split(blk_add_trace_split, NULL); 1041 unregister_trace_block_split(blk_add_trace_split, NULL);
1030 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1031 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
@@ -1815,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1815 rwbs[i] = '\0'; 1827 rwbs[i] = '\0';
1816} 1828}
1817 1829
1818void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1819{
1820 int rw = rq->cmd_flags & 0x03;
1821 int bytes;
1822
1823 if (rq->cmd_flags & REQ_DISCARD)
1824 rw |= REQ_DISCARD;
1825
1826 if (rq->cmd_flags & REQ_SECURE)
1827 rw |= REQ_SECURE;
1828
1829 bytes = blk_rq_bytes(rq);
1830
1831 blk_fill_rwbs(rwbs, rw, bytes);
1832}
1833
1834#endif /* CONFIG_EVENT_TRACING */ 1830#endif /* CONFIG_EVENT_TRACING */
1835 1831
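Note on the blktrace.c changes as a whole: blk_add_trace_bio() grows an explicit error argument (the completion path passes the real error, the other call sites pass 0, and a bio that is not BIO_UPTODATE is normalized to EIO), the remap tracepoint is renamed to block_bio_remap, and the unused blk_fill_rwbs_rq() helper is dropped. A user-space sketch of just the error-normalization rule:

/* Error normalization as in blk_add_trace_bio(): trust an explicit
 * error if the caller supplied one, otherwise derive EIO from a
 * "not up to date" flag.  The hunk stores positive EIO, as here. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int normalize_error(int error, bool up_to_date)
{
        if (!error && !up_to_date)
                error = EIO;
        return error;
}

int main(void)
{
        printf("%d %d %d\n",
               normalize_error(0, true),        /* 0: success        */
               normalize_error(0, false),       /* EIO: derived      */
               normalize_error(ENOMEM, false)); /* caller's error wins */
        return 0;
}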
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1313 1313
1314 __this_cpu_inc(user_stack_count); 1314 __this_cpu_inc(user_stack_count);
1315 1315
1316
1317
1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1316 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1319 sizeof(*entry), flags, pc); 1317 sizeof(*entry), flags, pc);
1320 if (!event) 1318 if (!event)
1321 return; 1319 goto out_drop_count;
1322 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1323 1321
1324 entry->tgid = current->tgid; 1322 entry->tgid = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1333 if (!filter_check_discard(call, entry, buffer, event)) 1331 if (!filter_check_discard(call, entry, buffer, event))
1334 ring_buffer_unlock_commit(buffer, event); 1332 ring_buffer_unlock_commit(buffer, event);
1335 1333
1334 out_drop_count:
1336 __this_cpu_dec(user_stack_count); 1335 __this_cpu_dec(user_stack_count);
1337
1338 out: 1336 out:
1339 preempt_enable(); 1337 preempt_enable();
1340} 1338}
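Note on the trace.c hunk: the per-cpu user_stack_count is bumped before the ring-buffer reservation, and the old code returned straight out when the reservation failed, leaking the increment. Routing the failure through the new out_drop_count label keeps every increment paired with a decrement. A small user-space sketch of that balanced inc/dec pattern, with illustrative names:

/* Balanced counter around an allocation: every exit after the
 * increment funnels through the label that undoes it. */
#include <stdio.h>
#include <stdlib.h>

static int in_flight;

static int record_event(size_t size)
{
        void *event;

        in_flight++;                    /* mirrors __this_cpu_inc()      */

        event = malloc(size);           /* stands in for the reservation */
        if (!event)
                goto out_drop_count;    /* don't return and leak the count */

        /* ... fill and commit the event ... */
        free(event);

out_drop_count:
        in_flight--;                    /* mirrors __this_cpu_dec()      */
        return 0;
}

int main(void)
{
        record_event(64);
        printf("in_flight=%d\n", in_flight);    /* stays balanced at 0 */
        return 0;
}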
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 35fde09b81de..5f499e0438a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
1284static void trace_module_add_events(struct module *mod) 1284static void trace_module_add_events(struct module *mod)
1285{ 1285{
1286 struct ftrace_module_file_ops *file_ops = NULL; 1286 struct ftrace_module_file_ops *file_ops = NULL;
1287 struct ftrace_event_call *call, *start, *end; 1287 struct ftrace_event_call **call, **start, **end;
1288 1288
1289 start = mod->trace_events; 1289 start = mod->trace_events;
1290 end = mod->trace_events + mod->num_trace_events; 1290 end = mod->trace_events + mod->num_trace_events;
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
1297 return; 1297 return;
1298 1298
1299 for_each_event(call, start, end) { 1299 for_each_event(call, start, end) {
1300 __trace_add_event_call(call, mod, 1300 __trace_add_event_call(*call, mod,
1301 &file_ops->id, &file_ops->enable, 1301 &file_ops->id, &file_ops->enable,
1302 &file_ops->filter, &file_ops->format); 1302 &file_ops->filter, &file_ops->format);
1303 } 1303 }
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
1367 .priority = 0, 1367 .priority = 0,
1368}; 1368};
1369 1369
1370extern struct ftrace_event_call __start_ftrace_events[]; 1370extern struct ftrace_event_call *__start_ftrace_events[];
1371extern struct ftrace_event_call __stop_ftrace_events[]; 1371extern struct ftrace_event_call *__stop_ftrace_events[];
1372 1372
1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1374 1374
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
1384 1384
1385static __init int event_trace_init(void) 1385static __init int event_trace_init(void)
1386{ 1386{
1387 struct ftrace_event_call *call; 1387 struct ftrace_event_call **call;
1388 struct dentry *d_tracer; 1388 struct dentry *d_tracer;
1389 struct dentry *entry; 1389 struct dentry *entry;
1390 struct dentry *d_events; 1390 struct dentry *d_events;
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void)
1430 pr_warning("tracing: Failed to allocate common fields"); 1430 pr_warning("tracing: Failed to allocate common fields");
1431 1431
1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1433 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1433 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1434 &ftrace_enable_fops, 1434 &ftrace_enable_fops,
1435 &ftrace_event_filter_fops, 1435 &ftrace_event_filter_fops,
1436 &ftrace_event_format_fops); 1436 &ftrace_event_format_fops);
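Note on the trace_events.c hunks: the event tables (both the built-in __start/__stop range and mod->trace_events) now hold pointers to struct ftrace_event_call rather than the structs themselves, so the iterator becomes a double pointer and every use dereferences it. A minimal user-space sketch of walking such a [start, stop) pointer range, with illustrative types:

/* Iterate a range of pointers and dereference each element. */
#include <stdio.h>

struct event_call { const char *name; };

static struct event_call ev_a = { "ev_a" }, ev_b = { "ev_b" };
static struct event_call *event_table[] = { &ev_a, &ev_b };

int main(void)
{
        struct event_call **start = event_table;
        struct event_call **stop  = event_table + 2;
        struct event_call **call;

        for (call = start; call < stop; call++)
                printf("registering %s\n", (*call)->name);
        return 0;
}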
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4b74d71705c0..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 162}; \
163 \ 163 \
164struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
165__attribute__((__aligned__(4))) \
166__attribute__((section("_ftrace_events"))) event_##call = { \
167 .name = #call, \ 165 .name = #call, \
168 .event.type = etype, \ 166 .event.type = etype, \
169 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
170 .print_fmt = print, \ 168 .print_fmt = print, \
171}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 172
173#include "trace_entries.h" 173#include "trace_entries.h"
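Note on the trace_export.c hunk: instead of forcing the ftrace_event_call structs themselves into the _ftrace_events section (which needed the __aligned__(4) workaround), the macro now defines the struct normally and places only a pointer to it in the section. The user-space demo below shows the same "section of pointers" trick, assuming GCC/Clang with GNU ld on an ELF target, where __start_/__stop_ symbols are generated automatically for identifier-named sections; names here are invented for illustration.

/* Collect pointers to objects in a named section and walk the range
 * bounded by the linker-provided __start_/__stop_ symbols. */
#include <stdio.h>

struct event { const char *name; };

#define DEFINE_EVENT(sym)                                               \
        static struct event sym = { #sym };                             \
        static struct event *sym##_ptr                                  \
        __attribute__((used, section("my_events"))) = &sym

DEFINE_EVENT(ev_open);
DEFINE_EVENT(ev_close);

extern struct event *__start_my_events[];
extern struct event *__stop_my_events[];

int main(void)
{
        struct event **p;

        for (p = __start_my_events; p < __stop_my_events; p++)
                printf("event: %s\n", (*p)->name);
        return 0;
}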
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..5c9fe08d2093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall)
73{ 65{
74 struct syscall_metadata *start; 66 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 67 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 68 char str[KSYM_SYMBOL_LEN];
77 69
78 70
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 71 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 72 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 73 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 74
83 for ( ; start < stop; start++) { 75 for ( ; start < stop; start++) {
@@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
87 * with "SyS" instead of "sys", leading to an unwanted 79 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch. 80 * mismatch.
89 */ 81 */
90 if (start->name && !strcmp(start->name + 3, str + 3)) 82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
91 return start; 83 return *start;
92 } 84 }
93 return NULL; 85 return NULL;
94} 86}
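Note on the trace_syscalls.c hunks: the syscalls_metadata section likewise becomes an array of pointers, the per-class exit-fields list moves into the class itself, and find_syscall_meta() keeps its prefix-skipping name match so "sys_foo" and a compat "SyS_foo" wrapper compare equal. A user-space sketch of that comparison:

/* Compare syscall names past the first three characters, so the
 * "sys"/"SyS" prefix difference does not cause a mismatch. */
#include <stdio.h>
#include <string.h>

static int same_syscall(const char *meta_name, const char *symbol)
{
        return meta_name && strcmp(meta_name + 3, symbol + 3) == 0;
}

int main(void)
{
        printf("%d\n", same_syscall("sys_read", "SyS_read"));   /* 1 */
        printf("%d\n", same_syscall("sys_read", "SyS_write"));  /* 0 */
        return 0;
}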
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
32 32
33/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
34static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
298 * 298 *
299 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 */ 300 */
301void 301void tracepoint_update_probe_range(struct tracepoint * const *begin,
302tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 302 struct tracepoint * const *end)
303{ 303{
304 struct tracepoint *iter; 304 struct tracepoint * const *iter;
305 struct tracepoint_entry *mark_entry; 305 struct tracepoint_entry *mark_entry;
306 306
307 if (!begin) 307 if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
309 309
310 mutex_lock(&tracepoints_mutex); 310 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint(iter->name); 312 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 313 if (mark_entry) {
314 set_tracepoint(&mark_entry, iter, 314 set_tracepoint(&mark_entry, *iter,
315 !!mark_entry->refcount); 315 !!mark_entry->refcount);
316 } else { 316 } else {
317 disable_tracepoint(iter); 317 disable_tracepoint(*iter);
318 } 318 }
319 } 319 }
320 mutex_unlock(&tracepoints_mutex); 320 mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
326static void tracepoint_update_probes(void) 326static void tracepoint_update_probes(void)
327{ 327{
328 /* Core kernel tracepoints */ 328 /* Core kernel tracepoints */
329 tracepoint_update_probe_range(__start___tracepoints, 329 tracepoint_update_probe_range(__start___tracepoints_ptrs,
330 __stop___tracepoints); 330 __stop___tracepoints_ptrs);
331 /* tracepoints in modules. */ 331 /* tracepoints in modules. */
332 module_update_tracepoints(); 332 module_update_tracepoints();
333} 333}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
514 * Will return the first tracepoint in the range if the input tracepoint is 514 * Will return the first tracepoint in the range if the input tracepoint is
515 * NULL. 515 * NULL.
516 */ 516 */
517int tracepoint_get_iter_range(struct tracepoint **tracepoint, 517int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
518 struct tracepoint *begin, struct tracepoint *end) 518 struct tracepoint * const *begin, struct tracepoint * const *end)
519{ 519{
520 if (!*tracepoint && begin != end) { 520 if (!*tracepoint && begin != end) {
521 *tracepoint = begin; 521 *tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 /* Core kernel tracepoints */ 534 /* Core kernel tracepoints */
535 if (!iter->module) { 535 if (!iter->module) {
536 found = tracepoint_get_iter_range(&iter->tracepoint, 536 found = tracepoint_get_iter_range(&iter->tracepoint,
537 __start___tracepoints, __stop___tracepoints); 537 __start___tracepoints_ptrs,
538 __stop___tracepoints_ptrs);
538 if (found) 539 if (found)
539 goto end; 540 goto end;
540 } 541 }
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
585 switch (val) { 586 switch (val) {
586 case MODULE_STATE_COMING: 587 case MODULE_STATE_COMING:
587 case MODULE_STATE_GOING: 588 case MODULE_STATE_GOING:
588 tracepoint_update_probe_range(mod->tracepoints, 589 tracepoint_update_probe_range(mod->tracepoints_ptrs,
589 mod->tracepoints + mod->num_tracepoints); 590 mod->tracepoints_ptrs + mod->num_tracepoints);
590 break; 591 break;
591 } 592 }
592 return 0; 593 return 0;
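Note on the tracepoint.c hunks: the core and per-module tracepoint tables become ranges of struct tracepoint * const, so the update loop and the iterator walk pointers and dereference each entry. The sketch below is a user-space illustration in the spirit of tracepoint_get_iter_range(): a cursor that starts at the beginning of a [begin, end) pointer range and reports whether it is still inside it. Types and names are illustrative only.

/* Resumable iteration over a range of const pointers. */
#include <stdio.h>

struct tp { const char *name; };

static const struct tp a = { "sched_switch" }, b = { "irq_entry" };
static const struct tp *const table[] = { &a, &b };

static int get_iter_range(const struct tp *const **cursor,
                          const struct tp *const *begin,
                          const struct tp *const *end)
{
        if (!*cursor && begin != end)
                *cursor = begin;        /* start at the first entry */
        return *cursor && *cursor < end;
}

int main(void)
{
        const struct tp *const *it = NULL;

        while (get_iter_range(&it, table, table + 2)) {
                printf("%s\n", (*it)->name);
                it++;
        }
        return 0;
}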
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
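Note on the user_namespace.c hunk: user namespaces now come from a dedicated slab cache created at init time (SLAB_PANIC makes a failed cache creation fatal at boot), and the kmalloc/kfree call sites switch to kmem_cache_alloc()/kmem_cache_free(). The kernel's kmem_cache does much more than the toy below; this is only a user-space analogue showing the shape of a fixed-size object cache with a free list.

/* Tiny user-space object cache: one object size, cheap reuse of freed
 * objects via a free list. */
#include <stdio.h>
#include <stdlib.h>

struct user_ns { int level; struct user_ns *next_free; };

static struct user_ns *free_list;

static struct user_ns *ns_alloc(void)
{
        struct user_ns *ns = free_list;

        if (ns)
                free_list = ns->next_free;      /* reuse a cached object */
        else
                ns = malloc(sizeof(*ns));       /* grow the cache */
        return ns;
}

static void ns_free(struct user_ns *ns)
{
        ns->next_free = free_list;              /* return it to the cache */
        free_list = ns;
}

int main(void)
{
        struct user_ns *ns = ns_alloc();

        if (ns)
                ns_free(ns);
        printf("cached: %p\n", (void *)free_list);
        return 0;
}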
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7ebdf4cea98..18bb15776c57 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int no_watchdog;
47
48
49/* boot commands */ 46/* boot commands */
50/* 47/*
51 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
58 if (!strncmp(str, "panic", 5)) 55 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 56 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1)) 57 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1; 58 watchdog_enabled = 0;
62 return 1; 59 return 1;
63} 60}
64__setup("nmi_watchdog=", hardlockup_panic_setup); 61__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
77 74
78static int __init nowatchdog_setup(char *str) 75static int __init nowatchdog_setup(char *str)
79{ 76{
80 no_watchdog = 1; 77 watchdog_enabled = 0;
81 return 1; 78 return 1;
82} 79}
83__setup("nowatchdog", nowatchdog_setup); 80__setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
85/* deprecated */ 82/* deprecated */
86static int __init nosoftlockup_setup(char *str) 83static int __init nosoftlockup_setup(char *str)
87{ 84{
88 no_watchdog = 1; 85 watchdog_enabled = 0;
89 return 1; 86 return 1;
90} 87}
91__setup("nosoftlockup", nosoftlockup_setup); 88__setup("nosoftlockup", nosoftlockup_setup);
@@ -366,8 +363,14 @@ static int watchdog_nmi_enable(int cpu)
366 goto out_save; 363 goto out_save;
367 } 364 }
368 365
369 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", 366
370 cpu, PTR_ERR(event)); 367 /* vary the KERN level based on the returned errno */
368 if (PTR_ERR(event) == -EOPNOTSUPP)
369 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
370 else if (PTR_ERR(event) == -ENOENT)
371 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
372 else
373 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
371 return PTR_ERR(event); 374 return PTR_ERR(event);
372 375
373 /* success path */ 376 /* success path */
@@ -432,9 +435,6 @@ static int watchdog_enable(int cpu)
432 wake_up_process(p); 435 wake_up_process(p);
433 } 436 }
434 437
435 /* if any cpu succeeds, watchdog is considered enabled for the system */
436 watchdog_enabled = 1;
437
438 return 0; 438 return 0;
439} 439}
440 440
@@ -462,12 +462,16 @@ static void watchdog_disable(int cpu)
462static void watchdog_enable_all_cpus(void) 462static void watchdog_enable_all_cpus(void)
463{ 463{
464 int cpu; 464 int cpu;
465 int result = 0; 465
466 watchdog_enabled = 0;
466 467
467 for_each_online_cpu(cpu) 468 for_each_online_cpu(cpu)
468 result += watchdog_enable(cpu); 469 if (!watchdog_enable(cpu))
470 /* if any cpu succeeds, watchdog is considered
471 enabled for the system */
472 watchdog_enabled = 1;
469 473
470 if (result) 474 if (!watchdog_enabled)
471 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 475 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
472 476
473} 477}
@@ -476,9 +480,6 @@ static void watchdog_disable_all_cpus(void)
476{ 480{
477 int cpu; 481 int cpu;
478 482
479 if (no_watchdog)
480 return;
481
482 for_each_online_cpu(cpu) 483 for_each_online_cpu(cpu)
483 watchdog_disable(cpu); 484 watchdog_disable(cpu);
484 485
@@ -498,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
498{ 499{
499 proc_dointvec(table, write, buffer, length, ppos); 500 proc_dointvec(table, write, buffer, length, ppos);
500 501
501 if (watchdog_enabled) 502 if (write) {
502 watchdog_enable_all_cpus(); 503 if (watchdog_enabled)
503 else 504 watchdog_enable_all_cpus();
504 watchdog_disable_all_cpus(); 505 else
506 watchdog_disable_all_cpus();
507 }
505 return 0; 508 return 0;
506} 509}
507 510
@@ -530,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
530 break; 533 break;
531 case CPU_ONLINE: 534 case CPU_ONLINE:
532 case CPU_ONLINE_FROZEN: 535 case CPU_ONLINE_FROZEN:
533 err = watchdog_enable(hotcpu); 536 if (watchdog_enabled)
537 err = watchdog_enable(hotcpu);
534 break; 538 break;
535#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
536 case CPU_UP_CANCELED: 540 case CPU_UP_CANCELED:
@@ -555,9 +559,6 @@ void __init lockup_detector_init(void)
555 void *cpu = (void *)(long)smp_processor_id(); 559 void *cpu = (void *)(long)smp_processor_id();
556 int err; 560 int err;
557 561
558 if (no_watchdog)
559 return;
560
561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 562 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
562 WARN_ON(notifier_to_errno(err)); 563 WARN_ON(notifier_to_errno(err));
563 564
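Note on the watchdog.c changes: the separate no_watchdog flag is dropped in favour of watchdog_enabled (now defaulting to 1), the boot options simply clear it, the hotplug callback only enables the watchdog when it is set, and the perf-event failure path picks its printk level from the returned errno (EOPNOTSUPP → info, ENOENT → warning, anything else → error). A user-space sketch of that errno-to-severity mapping, with plain strings standing in for KERN_* levels:

/* Choose a log severity from an error code, as watchdog_nmi_enable()
 * now does for the failed perf-event case. */
#include <errno.h>
#include <stdio.h>

static void report_enable_failure(int cpu, long err)
{
        if (err == -EOPNOTSUPP)
                printf("info: NMI watchdog disabled (cpu%d): not supported\n", cpu);
        else if (err == -ENOENT)
                printf("warning: NMI watchdog disabled (cpu%d): hw events not enabled\n", cpu);
        else
                printf("error: NMI watchdog disabled (cpu%d): err %ld\n", cpu, err);
}

int main(void)
{
        report_enable_failure(0, -EOPNOTSUPP);
        report_enable_failure(1, -ENOENT);
        report_enable_failure(2, -EINVAL);
        return 0;
}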
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec82f88a..ee6578b578ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
81 81
82 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ 82 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
83 /* call for help after 10ms
84 (min two ticks) */
83 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
84 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breath after fail */
85 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ 87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
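Note on the hunk above: with HZ=100 (or lower), HZ / 100 rounds down to one tick or less, which is too short to be a meaningful timeout, so the initial mayday timeout is clamped to a minimum of two ticks. The expression evaluated for a few illustrative HZ values:

/* "About 10ms, but never fewer than two ticks." */
#include <stdio.h>

static int mayday_initial_timeout(int hz)
{
        return hz / 100 >= 2 ? hz / 100 : 2;
}

int main(void)
{
        printf("HZ=1000 -> %d ticks\n", mayday_initial_timeout(1000)); /* 10 */
        printf("HZ=250  -> %d ticks\n", mayday_initial_timeout(250));  /* 2  */
        printf("HZ=100  -> %d ticks\n", mayday_initial_timeout(100));  /* 2  */
        return 0;
}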
@@ -768,7 +770,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768 770
769 worker->flags &= ~flags; 771 worker->flags &= ~flags;
770 772
771 /* if transitioning out of NOT_RUNNING, increment nr_running */ 773 /*
774 * If transitioning out of NOT_RUNNING, increment nr_running. Note
775 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
776 * of multiple flags, not a single flag.
777 */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 778 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING)) 779 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 780 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -1840,7 +1846,7 @@ __acquires(&gcwq->lock)
1840 spin_unlock_irq(&gcwq->lock); 1846 spin_unlock_irq(&gcwq->lock);
1841 1847
1842 work_clear_pending(work); 1848 work_clear_pending(work);
1843 lock_map_acquire(&cwq->wq->lockdep_map); 1849 lock_map_acquire_read(&cwq->wq->lockdep_map);
1844 lock_map_acquire(&lockdep_map); 1850 lock_map_acquire(&lockdep_map);
1845 trace_workqueue_execute_start(work); 1851 trace_workqueue_execute_start(work);
1846 f(work); 1852 f(work);
@@ -2043,6 +2049,15 @@ repeat:
2043 move_linked_works(work, scheduled, &n); 2049 move_linked_works(work, scheduled, &n);
2044 2050
2045 process_scheduled_works(rescuer); 2051 process_scheduled_works(rescuer);
2052
2053 /*
2054 * Leave this gcwq. If keep_working() is %true, notify a
2055 * regular worker; otherwise, we end up with 0 concurrency
2056 * and stalling the execution.
2057 */
2058 if (keep_working(gcwq))
2059 wake_up_worker(gcwq);
2060
2046 spin_unlock_irq(&gcwq->lock); 2061 spin_unlock_irq(&gcwq->lock);
2047 } 2062 }
2048 2063
@@ -2384,8 +2399,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2384 insert_wq_barrier(cwq, barr, work, worker); 2399 insert_wq_barrier(cwq, barr, work, worker);
2385 spin_unlock_irq(&gcwq->lock); 2400 spin_unlock_irq(&gcwq->lock);
2386 2401
2387 lock_map_acquire(&cwq->wq->lockdep_map); 2402 /*
2403 * If @max_active is 1 or rescuer is in use, flushing another work
2404 * item on the same workqueue may lead to deadlock. Make sure the
2405 * flusher is not running on the same workqueue by verifying write
2406 * access.
2407 */
2408 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2409 lock_map_acquire(&cwq->wq->lockdep_map);
2410 else
2411 lock_map_acquire_read(&cwq->wq->lockdep_map);
2388 lock_map_release(&cwq->wq->lockdep_map); 2412 lock_map_release(&cwq->wq->lockdep_map);
2413
2389 return true; 2414 return true;
2390already_gone: 2415already_gone:
2391 spin_unlock_irq(&gcwq->lock); 2416 spin_unlock_irq(&gcwq->lock);
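Note on the start_flush_work() hunk: the workqueue's lockdep map is now taken for reading by default, and the exclusive (write) acquire is kept only when max_active is 1 or a rescuer is in use, i.e. the cases where flushing another work item on the same workqueue really can deadlock. The sketch below is a rough user-space analogue using a pthread rwlock and invented names, not the kernel's lockdep API.

/* Take the exclusive flavour of an annotation lock only when
 * concurrent flushers could genuinely conflict. */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t wq_map = PTHREAD_RWLOCK_INITIALIZER;

static void annotate_flush(bool single_slot_or_rescuer)
{
        if (single_slot_or_rescuer)
                pthread_rwlock_wrlock(&wq_map); /* exclusive: real conflict  */
        else
                pthread_rwlock_rdlock(&wq_map); /* shared: flushers may nest */
        pthread_rwlock_unlock(&wq_map);
}

int main(void)
{
        annotate_flush(false);
        annotate_flush(true);
        return 0;
}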
@@ -2942,7 +2967,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2942 */ 2967 */
2943 spin_lock(&workqueue_lock); 2968 spin_lock(&workqueue_lock);
2944 2969
2945 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) 2970 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2946 for_each_cwq_cpu(cpu, wq) 2971 for_each_cwq_cpu(cpu, wq)
2947 get_cwq(cpu, wq)->max_active = 0; 2972 get_cwq(cpu, wq)->max_active = 0;
2948 2973
@@ -3054,7 +3079,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3054 3079
3055 spin_lock_irq(&gcwq->lock); 3080 spin_lock_irq(&gcwq->lock);
3056 3081
3057 if (!(wq->flags & WQ_FREEZEABLE) || 3082 if (!(wq->flags & WQ_FREEZABLE) ||
3058 !(gcwq->flags & GCWQ_FREEZING)) 3083 !(gcwq->flags & GCWQ_FREEZING))
3059 get_cwq(gcwq->cpu, wq)->max_active = max_active; 3084 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3060 3085
@@ -3304,7 +3329,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
3304 * want to get it over with ASAP - spam rescuers, wake up as 3329 * want to get it over with ASAP - spam rescuers, wake up as
3305 * many idlers as necessary and create new ones till the 3330 * many idlers as necessary and create new ones till the
3306 * worklist is empty. Note that if the gcwq is frozen, there 3331 * worklist is empty. Note that if the gcwq is frozen, there
3307 * may be frozen works in freezeable cwqs. Don't declare 3332 * may be frozen works in freezable cwqs. Don't declare
3308 * completion while frozen. 3333 * completion while frozen.
3309 */ 3334 */
3310 while (gcwq->nr_workers != gcwq->nr_idle || 3335 while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3562,9 +3587,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3562/** 3587/**
3563 * freeze_workqueues_begin - begin freezing workqueues 3588 * freeze_workqueues_begin - begin freezing workqueues
3564 * 3589 *
3565 * Start freezing workqueues. After this function returns, all 3590 * Start freezing workqueues. After this function returns, all freezable
3566 * freezeable workqueues will queue new works to their frozen_works 3591 * workqueues will queue new works to their frozen_works list instead of
3567 * list instead of gcwq->worklist. 3592 * gcwq->worklist.
3568 * 3593 *
3569 * CONTEXT: 3594 * CONTEXT:
3570 * Grabs and releases workqueue_lock and gcwq->lock's. 3595 * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3590,7 +3615,7 @@ void freeze_workqueues_begin(void)
3590 list_for_each_entry(wq, &workqueues, list) { 3615 list_for_each_entry(wq, &workqueues, list) {
3591 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3616 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3592 3617
3593 if (cwq && wq->flags & WQ_FREEZEABLE) 3618 if (cwq && wq->flags & WQ_FREEZABLE)
3594 cwq->max_active = 0; 3619 cwq->max_active = 0;
3595 } 3620 }
3596 3621
@@ -3601,7 +3626,7 @@ void freeze_workqueues_begin(void)
3601} 3626}
3602 3627
3603/** 3628/**
3604 * freeze_workqueues_busy - are freezeable workqueues still busy? 3629 * freeze_workqueues_busy - are freezable workqueues still busy?
3605 * 3630 *
3606 * Check whether freezing is complete. This function must be called 3631 * Check whether freezing is complete. This function must be called
3607 * between freeze_workqueues_begin() and thaw_workqueues(). 3632 * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3610,8 +3635,8 @@ void freeze_workqueues_begin(void)
3610 * Grabs and releases workqueue_lock. 3635 * Grabs and releases workqueue_lock.
3611 * 3636 *
3612 * RETURNS: 3637 * RETURNS:
3613 * %true if some freezeable workqueues are still busy. %false if 3638 * %true if some freezable workqueues are still busy. %false if freezing
3614 * freezing is complete. 3639 * is complete.
3615 */ 3640 */
3616bool freeze_workqueues_busy(void) 3641bool freeze_workqueues_busy(void)
3617{ 3642{
@@ -3631,7 +3656,7 @@ bool freeze_workqueues_busy(void)
3631 list_for_each_entry(wq, &workqueues, list) { 3656 list_for_each_entry(wq, &workqueues, list) {
3632 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3657 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3633 3658
3634 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3659 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3635 continue; 3660 continue;
3636 3661
3637 BUG_ON(cwq->nr_active < 0); 3662 BUG_ON(cwq->nr_active < 0);
@@ -3676,7 +3701,7 @@ void thaw_workqueues(void)
3676 list_for_each_entry(wq, &workqueues, list) { 3701 list_for_each_entry(wq, &workqueues, list) {
3677 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3702 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3678 3703
3679 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3704 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3680 continue; 3705 continue;
3681 3706
3682 /* restore max_active and repopulate worklist */ 3707 /* restore max_active and repopulate worklist */