aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorRussell King <rmk@dyn-67.arm.linux.org.uk>2008-05-09 18:24:09 -0400
committerRussell King <rmk+kernel@arm.linux.org.uk>2008-05-09 18:24:09 -0400
commit1f2ee6496b1f71e9d5aa2448745e65fbafdc3bd5 (patch)
tree3f143311afca5e316afd06c2fc4f7d73b19cdcf0 /kernel
parent5bf6c6e30d8b71d092e8830208e182d84b907fcd (diff)
parentda109897a142dd017172c0ce7abf0be8646f7109 (diff)
Merge branch 'for-rmk' of git://git.kernel.org/pub/scm/linux/kernel/git/nico/orion into fixes
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cpuset.c52
-rw-r--r--kernel/futex.c176
-rw-r--r--kernel/hrtimer.c9
-rw-r--r--kernel/kgdb.c8
-rw-r--r--kernel/module.c44
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/sched.c323
-rw-r--r--kernel/sched_clock.c236
-rw-r--r--kernel/sched_debug.c7
-rw-r--r--kernel/sched_fair.c50
-rw-r--r--kernel/sched_idletask.c2
-rw-r--r--kernel/sched_rt.c9
-rw-r--r--kernel/semaphore.c64
-rw-r--r--kernel/time/clocksource.c4
15 files changed, 500 insertions, 488 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8da627d33804..86ea9e34e326 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1031,11 +1031,9 @@ int current_cpuset_is_being_rebound(void)
1031 return task_cs(current) == cpuset_being_rebound; 1031 return task_cs(current) == cpuset_being_rebound;
1032} 1032}
1033 1033
1034static int update_relax_domain_level(struct cpuset *cs, char *buf) 1034static int update_relax_domain_level(struct cpuset *cs, s64 val)
1035{ 1035{
1036 int val = simple_strtol(buf, NULL, 10); 1036 if ((int)val < 0)
1037
1038 if (val < 0)
1039 val = -1; 1037 val = -1;
1040 1038
1041 if (val != cs->relax_domain_level) { 1039 if (val != cs->relax_domain_level) {
@@ -1280,9 +1278,6 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1280 case FILE_MEMLIST: 1278 case FILE_MEMLIST:
1281 retval = update_nodemask(cs, buffer); 1279 retval = update_nodemask(cs, buffer);
1282 break; 1280 break;
1283 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1284 retval = update_relax_domain_level(cs, buffer);
1285 break;
1286 default: 1281 default:
1287 retval = -EINVAL; 1282 retval = -EINVAL;
1288 goto out2; 1283 goto out2;
@@ -1348,6 +1343,30 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1348 return retval; 1343 return retval;
1349} 1344}
1350 1345
1346static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1347{
1348 int retval = 0;
1349 struct cpuset *cs = cgroup_cs(cgrp);
1350 cpuset_filetype_t type = cft->private;
1351
1352 cgroup_lock();
1353
1354 if (cgroup_is_removed(cgrp)) {
1355 cgroup_unlock();
1356 return -ENODEV;
1357 }
1358 switch (type) {
1359 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1360 retval = update_relax_domain_level(cs, val);
1361 break;
1362 default:
1363 retval = -EINVAL;
1364 break;
1365 }
1366 cgroup_unlock();
1367 return retval;
1368}
1369
1351/* 1370/*
1352 * These ascii lists should be read in a single call, by using a user 1371 * These ascii lists should be read in a single call, by using a user
1353 * buffer large enough to hold the entire map. If read in smaller 1372 * buffer large enough to hold the entire map. If read in smaller
@@ -1406,9 +1425,6 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1406 case FILE_MEMLIST: 1425 case FILE_MEMLIST:
1407 s += cpuset_sprintf_memlist(s, cs); 1426 s += cpuset_sprintf_memlist(s, cs);
1408 break; 1427 break;
1409 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1410 s += sprintf(s, "%d", cs->relax_domain_level);
1411 break;
1412 default: 1428 default:
1413 retval = -EINVAL; 1429 retval = -EINVAL;
1414 goto out; 1430 goto out;
@@ -1449,6 +1465,18 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1449 } 1465 }
1450} 1466}
1451 1467
1468static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1469{
1470 struct cpuset *cs = cgroup_cs(cont);
1471 cpuset_filetype_t type = cft->private;
1472 switch (type) {
1473 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1474 return cs->relax_domain_level;
1475 default:
1476 BUG();
1477 }
1478}
1479
1452 1480
1453/* 1481/*
1454 * for the common functions, 'private' gives the type of file 1482 * for the common functions, 'private' gives the type of file
@@ -1499,8 +1527,8 @@ static struct cftype files[] = {
1499 1527
1500 { 1528 {
1501 .name = "sched_relax_domain_level", 1529 .name = "sched_relax_domain_level",
1502 .read_u64 = cpuset_read_u64, 1530 .read_s64 = cpuset_read_s64,
1503 .write_u64 = cpuset_write_u64, 1531 .write_s64 = cpuset_write_s64,
1504 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1532 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1505 }, 1533 },
1506 1534
diff --git a/kernel/futex.c b/kernel/futex.c
index 98092c9817f4..449def8074fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -104,10 +104,6 @@ struct futex_q {
104 /* Key which the futex is hashed on: */ 104 /* Key which the futex is hashed on: */
105 union futex_key key; 105 union futex_key key;
106 106
107 /* For fd, sigio sent using these: */
108 int fd;
109 struct file *filp;
110
111 /* Optional priority inheritance state: */ 107 /* Optional priority inheritance state: */
112 struct futex_pi_state *pi_state; 108 struct futex_pi_state *pi_state;
113 struct task_struct *task; 109 struct task_struct *task;
@@ -126,9 +122,6 @@ struct futex_hash_bucket {
126 122
127static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
128 124
129/* Futex-fs vfsmount entry: */
130static struct vfsmount *futex_mnt;
131
132/* 125/*
133 * Take mm->mmap_sem, when futex is shared 126 * Take mm->mmap_sem, when futex is shared
134 */ 127 */
@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
610static void wake_futex(struct futex_q *q) 603static void wake_futex(struct futex_q *q)
611{ 604{
612 plist_del(&q->list, &q->list.plist); 605 plist_del(&q->list, &q->list.plist);
613 if (q->filp)
614 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
615 /* 606 /*
616 * The lock in wake_up_all() is a crucial memory barrier after the 607 * The lock in wake_up_all() is a crucial memory barrier after the
617 * plist_del() and also before assigning to q->lock_ptr. 608 * plist_del() and also before assigning to q->lock_ptr.
@@ -988,14 +979,10 @@ out:
988} 979}
989 980
990/* The key must be already stored in q->key. */ 981/* The key must be already stored in q->key. */
991static inline struct futex_hash_bucket * 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
992queue_lock(struct futex_q *q, int fd, struct file *filp)
993{ 983{
994 struct futex_hash_bucket *hb; 984 struct futex_hash_bucket *hb;
995 985
996 q->fd = fd;
997 q->filp = filp;
998
999 init_waitqueue_head(&q->waiters); 986 init_waitqueue_head(&q->waiters);
1000 987
1001 get_futex_key_refs(&q->key); 988 get_futex_key_refs(&q->key);
@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
1006 return hb; 993 return hb;
1007} 994}
1008 995
1009static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1010{ 997{
1011 int prio; 998 int prio;
1012 999
@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1041 * exactly once. They are called with the hashed spinlock held. 1028 * exactly once. They are called with the hashed spinlock held.
1042 */ 1029 */
1043 1030
1044/* The key must be already stored in q->key. */
1045static void queue_me(struct futex_q *q, int fd, struct file *filp)
1046{
1047 struct futex_hash_bucket *hb;
1048
1049 hb = queue_lock(q, fd, filp);
1050 __queue_me(q, hb);
1051}
1052
1053/* Return 1 if we were still queued (ie. 0 means we were woken) */ 1031/* Return 1 if we were still queued (ie. 0 means we were woken) */
1054static int unqueue_me(struct futex_q *q) 1032static int unqueue_me(struct futex_q *q)
1055{ 1033{
@@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1194 if (unlikely(ret != 0)) 1172 if (unlikely(ret != 0))
1195 goto out_release_sem; 1173 goto out_release_sem;
1196 1174
1197 hb = queue_lock(&q, -1, NULL); 1175 hb = queue_lock(&q);
1198 1176
1199 /* 1177 /*
1200 * Access the page AFTER the futex is queued. 1178 * Access the page AFTER the futex is queued.
@@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1238 goto out_unlock_release_sem; 1216 goto out_unlock_release_sem;
1239 1217
1240 /* Only actually queue if *uaddr contained val. */ 1218 /* Only actually queue if *uaddr contained val. */
1241 __queue_me(&q, hb); 1219 queue_me(&q, hb);
1242 1220
1243 /* 1221 /*
1244 * Now the futex is queued and we have checked the data, we 1222 * Now the futex is queued and we have checked the data, we
@@ -1386,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1386 goto out_release_sem; 1364 goto out_release_sem;
1387 1365
1388 retry_unlocked: 1366 retry_unlocked:
1389 hb = queue_lock(&q, -1, NULL); 1367 hb = queue_lock(&q);
1390 1368
1391 retry_locked: 1369 retry_locked:
1392 ret = lock_taken = 0; 1370 ret = lock_taken = 0;
@@ -1499,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1499 /* 1477 /*
1500 * Only actually queue now that the atomic ops are done: 1478 * Only actually queue now that the atomic ops are done:
1501 */ 1479 */
1502 __queue_me(&q, hb); 1480 queue_me(&q, hb);
1503 1481
1504 /* 1482 /*
1505 * Now the futex is queued and we have checked the data, we 1483 * Now the futex is queued and we have checked the data, we
@@ -1746,121 +1724,6 @@ pi_faulted:
1746 return ret; 1724 return ret;
1747} 1725}
1748 1726
1749static int futex_close(struct inode *inode, struct file *filp)
1750{
1751 struct futex_q *q = filp->private_data;
1752
1753 unqueue_me(q);
1754 kfree(q);
1755
1756 return 0;
1757}
1758
1759/* This is one-shot: once it's gone off you need a new fd */
1760static unsigned int futex_poll(struct file *filp,
1761 struct poll_table_struct *wait)
1762{
1763 struct futex_q *q = filp->private_data;
1764 int ret = 0;
1765
1766 poll_wait(filp, &q->waiters, wait);
1767
1768 /*
1769 * plist_node_empty() is safe here without any lock.
1770 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1771 */
1772 if (plist_node_empty(&q->list))
1773 ret = POLLIN | POLLRDNORM;
1774
1775 return ret;
1776}
1777
1778static const struct file_operations futex_fops = {
1779 .release = futex_close,
1780 .poll = futex_poll,
1781};
1782
1783/*
1784 * Signal allows caller to avoid the race which would occur if they
1785 * set the sigio stuff up afterwards.
1786 */
1787static int futex_fd(u32 __user *uaddr, int signal)
1788{
1789 struct futex_q *q;
1790 struct file *filp;
1791 int ret, err;
1792 struct rw_semaphore *fshared;
1793 static unsigned long printk_interval;
1794
1795 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1796 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1797 "will be removed from the kernel in June 2007\n",
1798 current->comm);
1799 }
1800
1801 ret = -EINVAL;
1802 if (!valid_signal(signal))
1803 goto out;
1804
1805 ret = get_unused_fd();
1806 if (ret < 0)
1807 goto out;
1808 filp = get_empty_filp();
1809 if (!filp) {
1810 put_unused_fd(ret);
1811 ret = -ENFILE;
1812 goto out;
1813 }
1814 filp->f_op = &futex_fops;
1815 filp->f_path.mnt = mntget(futex_mnt);
1816 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1817 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1818
1819 if (signal) {
1820 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
1821 if (err < 0) {
1822 goto error;
1823 }
1824 filp->f_owner.signum = signal;
1825 }
1826
1827 q = kmalloc(sizeof(*q), GFP_KERNEL);
1828 if (!q) {
1829 err = -ENOMEM;
1830 goto error;
1831 }
1832 q->pi_state = NULL;
1833
1834 fshared = &current->mm->mmap_sem;
1835 down_read(fshared);
1836 err = get_futex_key(uaddr, fshared, &q->key);
1837
1838 if (unlikely(err != 0)) {
1839 up_read(fshared);
1840 kfree(q);
1841 goto error;
1842 }
1843
1844 /*
1845 * queue_me() must be called before releasing mmap_sem, because
1846 * key->shared.inode needs to be referenced while holding it.
1847 */
1848 filp->private_data = q;
1849
1850 queue_me(q, ret, filp);
1851 up_read(fshared);
1852
1853 /* Now we map fd to filp, so userspace can access it */
1854 fd_install(ret, filp);
1855out:
1856 return ret;
1857error:
1858 put_unused_fd(ret);
1859 put_filp(filp);
1860 ret = err;
1861 goto out;
1862}
1863
1864/* 1727/*
1865 * Support for robust futexes: the kernel cleans up held futexes at 1728 * Support for robust futexes: the kernel cleans up held futexes at
1866 * thread exit time. 1729 * thread exit time.
@@ -2092,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2092 case FUTEX_WAKE_BITSET: 1955 case FUTEX_WAKE_BITSET:
2093 ret = futex_wake(uaddr, fshared, val, val3); 1956 ret = futex_wake(uaddr, fshared, val, val3);
2094 break; 1957 break;
2095 case FUTEX_FD:
2096 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
2097 ret = futex_fd(uaddr, val);
2098 break;
2099 case FUTEX_REQUEUE: 1958 case FUTEX_REQUEUE:
2100 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 1959 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2101 break; 1960 break;
@@ -2156,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2156 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2015 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2157} 2016}
2158 2017
2159static int futexfs_get_sb(struct file_system_type *fs_type,
2160 int flags, const char *dev_name, void *data,
2161 struct vfsmount *mnt)
2162{
2163 return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
2164}
2165
2166static struct file_system_type futex_fs_type = {
2167 .name = "futexfs",
2168 .get_sb = futexfs_get_sb,
2169 .kill_sb = kill_anon_super,
2170};
2171
2172static int __init futex_init(void) 2018static int __init futex_init(void)
2173{ 2019{
2174 u32 curval; 2020 u32 curval;
@@ -2193,16 +2039,6 @@ static int __init futex_init(void)
2193 spin_lock_init(&futex_queues[i].lock); 2039 spin_lock_init(&futex_queues[i].lock);
2194 } 2040 }
2195 2041
2196 i = register_filesystem(&futex_fs_type);
2197 if (i)
2198 return i;
2199
2200 futex_mnt = kern_mount(&futex_fs_type);
2201 if (IS_ERR(futex_mnt)) {
2202 unregister_filesystem(&futex_fs_type);
2203 return PTR_ERR(futex_mnt);
2204 }
2205
2206 return 0; 2042 return 0;
2207} 2043}
2208__initcall(futex_init); 2044__initcall(futex_init);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9af1d6a8095e..421be5fe5cc7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -154,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
154} 154}
155 155
156/* 156/*
157 * Helper function to check, whether the timer is running the callback
158 * function
159 */
160static inline int hrtimer_callback_running(struct hrtimer *timer)
161{
162 return timer->state & HRTIMER_STATE_CALLBACK;
163}
164
165/*
166 * Functions and macros which are different for UP/SMP systems are kept in a 157 * Functions and macros which are different for UP/SMP systems are kept in a
167 * single place 158 * single place
168 */ 159 */
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 1bd0ec1c80b2..39e31a036f5b 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -61,7 +61,7 @@ struct kgdb_state {
61 int err_code; 61 int err_code;
62 int cpu; 62 int cpu;
63 int pass_exception; 63 int pass_exception;
64 long threadid; 64 unsigned long threadid;
65 long kgdb_usethreadid; 65 long kgdb_usethreadid;
66 struct pt_regs *linux_regs; 66 struct pt_regs *linux_regs;
67}; 67};
@@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
146 * the other CPUs might interfere with your debugging context, so 146 * the other CPUs might interfere with your debugging context, so
147 * use this with care: 147 * use this with care:
148 */ 148 */
149int kgdb_do_roundup = 1; 149static int kgdb_do_roundup = 1;
150 150
151static int __init opt_nokgdbroundup(char *str) 151static int __init opt_nokgdbroundup(char *str)
152{ 152{
@@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
438 * While we find nice hex chars, build a long_val. 438 * While we find nice hex chars, build a long_val.
439 * Return number of chars processed. 439 * Return number of chars processed.
440 */ 440 */
441int kgdb_hex2long(char **ptr, long *long_val) 441int kgdb_hex2long(char **ptr, unsigned long *long_val)
442{ 442{
443 int hex_val; 443 int hex_val;
444 int num = 0; 444 int num = 0;
@@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr)
709 return 0; 709 return 0;
710} 710}
711 711
712int remove_all_break(void) 712static int remove_all_break(void)
713{ 713{
714 unsigned long addr; 714 unsigned long addr;
715 int error; 715 int error;
diff --git a/kernel/module.c b/kernel/module.c
index 8674a390a2e8..8e4528c9909f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
890 890
891static const char vermagic[] = VERMAGIC_STRING; 891static const char vermagic[] = VERMAGIC_STRING;
892 892
893static int try_to_force_load(struct module *mod, const char *symname)
894{
895#ifdef CONFIG_MODULE_FORCE_LOAD
896 if (!(tainted & TAINT_FORCED_MODULE))
897 printk("%s: no version for \"%s\" found: kernel tainted.\n",
898 mod->name, symname);
899 add_taint_module(mod, TAINT_FORCED_MODULE);
900 return 0;
901#else
902 return -ENOEXEC;
903#endif
904}
905
893#ifdef CONFIG_MODVERSIONS 906#ifdef CONFIG_MODVERSIONS
894static int check_version(Elf_Shdr *sechdrs, 907static int check_version(Elf_Shdr *sechdrs,
895 unsigned int versindex, 908 unsigned int versindex,
@@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
914 927
915 if (versions[i].crc == *crc) 928 if (versions[i].crc == *crc)
916 return 1; 929 return 1;
917 printk("%s: disagrees about version of symbol %s\n",
918 mod->name, symname);
919 DEBUGP("Found checksum %lX vs module %lX\n", 930 DEBUGP("Found checksum %lX vs module %lX\n",
920 *crc, versions[i].crc); 931 *crc, versions[i].crc);
921 return 0; 932 goto bad_version;
922 } 933 }
923 /* Not in module's version table. OK, but that taints the kernel. */ 934
924 if (!(tainted & TAINT_FORCED_MODULE)) 935 if (!try_to_force_load(mod, symname))
925 printk("%s: no version for \"%s\" found: kernel tainted.\n", 936 return 1;
926 mod->name, symname); 937
927 add_taint_module(mod, TAINT_FORCED_MODULE); 938bad_version:
928 return 1; 939 printk("%s: disagrees about version of symbol %s\n",
940 mod->name, symname);
941 return 0;
929} 942}
930 943
931static inline int check_modstruct_version(Elf_Shdr *sechdrs, 944static inline int check_modstruct_version(Elf_Shdr *sechdrs,
@@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod,
1853 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1866 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1854 /* This is allowed: modprobe --force will invalidate it. */ 1867 /* This is allowed: modprobe --force will invalidate it. */
1855 if (!modmagic) { 1868 if (!modmagic) {
1856 add_taint_module(mod, TAINT_FORCED_MODULE); 1869 err = try_to_force_load(mod, "magic");
1857 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1870 if (err)
1858 mod->name); 1871 goto free_hdr;
1859 } else if (!same_magic(modmagic, vermagic)) { 1872 } else if (!same_magic(modmagic, vermagic)) {
1860 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 1873 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1861 mod->name, modmagic, vermagic); 1874 mod->name, modmagic, vermagic);
@@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod,
2006 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 2019 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
2007 (mod->num_unused_syms && !unusedcrcindex) || 2020 (mod->num_unused_syms && !unusedcrcindex) ||
2008 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 2021 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
2009 printk(KERN_WARNING "%s: No versions for exported symbols." 2022 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
2010 " Tainting kernel.\n", mod->name); 2023 err = try_to_force_load(mod, "nocrc");
2011 add_taint_module(mod, TAINT_FORCED_MODULE); 2024 if (err)
2025 goto cleanup;
2012 } 2026 }
2013#endif 2027#endif
2014 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); 2028 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..bc24dcdc570f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1191 ret = 0; 1191 ret = 0;
1192 spliced = 0; 1192 spliced = 0;
1193 1193
1194 while (len && !spliced) { 1194 while (len) {
1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1196 if (ret < 0) 1196 if (ret < 0)
1197 break; 1197 break;
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120e..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-programm hrtick_timer */ 1066 HRTICK_SET, /* re-programm hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7690{
7756 int i, j; 7691 int i, j;
7757 7692
7758 lock_doms_cur(); 7693 mutex_lock(&sched_domains_mutex);
7759 7694
7760 /* always unregister in case we don't destroy any domains */ 7695 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7696 unregister_sched_domain_sysctl();
@@ -7804,7 +7739,7 @@ match2:
7804 7739
7805 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
7806 7741
7807 unlock_doms_cur(); 7742 mutex_unlock(&sched_domains_mutex);
7808} 7743}
7809 7744
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7745#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7748 int err;
7814 7749
7815 get_online_cpus(); 7750 get_online_cpus();
7751 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7752 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7753 err = arch_init_sched_domains(&cpu_online_map);
7754 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7755 put_online_cpus();
7819 7756
7820 return err; 7757 return err;
@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7869 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7870#endif
7934 get_online_cpus(); 7871 get_online_cpus();
7872 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7873 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7874 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7875 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7876 cpu_set(smp_processor_id(), non_isolated_cpus);
7877 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7878 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7879 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7880 hotcpu_notifier(update_sched_domains, 0);
7881 init_hrtick();
7942 7882
7943 /* Move init over to a non-isolated CPU */ 7883 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7884 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7965
8026 se->my_q = cfs_rq; 7966 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7967 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); 7968 se->load.inv_weight = 0;
8029 se->parent = parent; 7969 se->parent = parent;
8030} 7970}
8031#endif 7971#endif
@@ -8149,8 +8089,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8091 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8233{
8296 int on_rq; 8234 int on_rq;
8235
8297 update_rq_clock(rq); 8236 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8299 if (on_rq) 8238 if (on_rq)
@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8265 p->se.block_start = 0;
8327#endif 8266#endif
8328 task_rq(p)->clock = 0;
8329 8267
8330 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8331 /* 8269 /*
@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8630 dequeue_entity(cfs_rq, se, 0);
8693 8631
8694 se->load.weight = shares; 8632 se->load.weight = shares;
8695 se->load.inv_weight = div64_u64((1ULL<<32), shares); 8633 se->load.inv_weight = 0;
8696 8634
8697 if (on_rq) 8635 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8636 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8660 if (!tg->se[0])
8723 return -EINVAL; 8661 return -EINVAL;
8724 8662
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8663 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8664 shares = MIN_SHARES;
8665 else if (shares > MAX_SHARES)
8666 shares = MAX_SHARES;
8732 8667
8733 mutex_lock(&shares_mutex); 8668 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8669 if (tg->shares == shares)
@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8688 * force a rebalance
8754 */ 8689 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8690 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8691 set_se_shares(tg->se[i], shares);
8757 } 8692 }
8758 8693
8759 /* 8694 /*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we dont want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gotd (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occured while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..e24ecd39c4b8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -662,10 +662,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
662 if (!initial) { 662 if (!initial) {
663 /* sleeps upto a single latency don't count. */ 663 /* sleeps upto a single latency don't count. */
664 if (sched_feat(NEW_FAIR_SLEEPERS)) { 664 if (sched_feat(NEW_FAIR_SLEEPERS)) {
665 unsigned long thresh = sysctl_sched_latency;
666
667 /*
668 * convert the sleeper threshold into virtual time
669 */
665 if (sched_feat(NORMALIZED_SLEEPER)) 670 if (sched_feat(NORMALIZED_SLEEPER))
666 vruntime -= calc_delta_weight(sysctl_sched_latency, se); 671 thresh = calc_delta_fair(thresh, se);
667 else 672
668 vruntime -= sysctl_sched_latency; 673 vruntime -= thresh;
669 } 674 }
670 675
671 /* ensure we never gain time by being placed backwards. */ 676 /* ensure we never gain time by being placed backwards. */
@@ -682,6 +687,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
682 * Update run-time statistics of the 'current'. 687 * Update run-time statistics of the 'current'.
683 */ 688 */
684 update_curr(cfs_rq); 689 update_curr(cfs_rq);
690 account_entity_enqueue(cfs_rq, se);
685 691
686 if (wakeup) { 692 if (wakeup) {
687 place_entity(cfs_rq, se, 0); 693 place_entity(cfs_rq, se, 0);
@@ -692,7 +698,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
692 check_spread(cfs_rq, se); 698 check_spread(cfs_rq, se);
693 if (se != cfs_rq->curr) 699 if (se != cfs_rq->curr)
694 __enqueue_entity(cfs_rq, se); 700 __enqueue_entity(cfs_rq, se);
695 account_entity_enqueue(cfs_rq, se);
696} 701}
697 702
698static void update_avg(u64 *avg, u64 sample) 703static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +846,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
841 * queued ticks are scheduled to match the slice, so don't bother 846 * queued ticks are scheduled to match the slice, so don't bother
842 * validating it and just reschedule. 847 * validating it and just reschedule.
843 */ 848 */
844 if (queued) 849 if (queued) {
845 return resched_task(rq_of(cfs_rq)->curr); 850 resched_task(rq_of(cfs_rq)->curr);
851 return;
852 }
846 /* 853 /*
847 * don't let the period tick interfere with the hrtick preemption 854 * don't let the period tick interfere with the hrtick preemption
848 */ 855 */
@@ -957,7 +964,7 @@ static void yield_task_fair(struct rq *rq)
957 return; 964 return;
958 965
959 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 966 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
960 __update_rq_clock(rq); 967 update_rq_clock(rq);
961 /* 968 /*
962 * Update run-time statistics of the 'current'. 969 * Update run-time statistics of the 'current'.
963 */ 970 */
@@ -1007,7 +1014,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1007 * sibling runqueue info. This will avoid the checks and cache miss 1014 * sibling runqueue info. This will avoid the checks and cache miss
1008 * penalities associated with that. 1015 * penalities associated with that.
1009 */ 1016 */
1010 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1017 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1011 return cpu; 1018 return cpu;
1012 1019
1013 for_each_domain(cpu, sd) { 1020 for_each_domain(cpu, sd) {
@@ -1611,30 +1618,6 @@ static const struct sched_class fair_sched_class = {
1611}; 1618};
1612 1619
1613#ifdef CONFIG_SCHED_DEBUG 1620#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1638static void print_cfs_stats(struct seq_file *m, int cpu) 1621static void print_cfs_stats(struct seq_file *m, int cpu)
1639{ 1622{
1640 struct cfs_rq *cfs_rq; 1623 struct cfs_rq *cfs_rq;
@@ -1642,9 +1625,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1642 rcu_read_lock(); 1625 rcu_read_lock();
1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1626 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1644 print_cfs_rq(m, cpu, cfs_rq); 1627 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1648 rcu_read_unlock(); 1628 rcu_read_unlock();
1649} 1629}
1650#endif 1630#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
99/* 99/*
100 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
101 */ 101 */
102const struct sched_class idle_sched_class = { 102static const struct sched_class idle_sched_class = {
103 /* .next is NULL */ 103 /* .next is NULL */
104 /* no enqueue/yield_task for idle tasks */ 104 /* no enqueue/yield_task for idle tasks */
105 105
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
1098 } 1098 }
1099} 1099}
1100 1100
1101 1101/*
1102 * If we are not running and we are not going to reschedule soon, we should
1103 * try to push tasks away now
1104 */
1102static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1105static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1103{ 1106{
1104 if (!task_running(rq, p) && 1107 if (!task_running(rq, p) &&
1105 (p->prio >= rq->rt.highest_prio) && 1108 !test_tsk_need_resched(rq->curr) &&
1106 rq->rt.overloaded) 1109 rq->rt.overloaded)
1107 push_rt_tasks(rq); 1110 push_rt_tasks(rq);
1108} 1111}
@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
1309 p->se.exec_start = rq->clock; 1312 p->se.exec_start = rq->clock;
1310} 1313}
1311 1314
1312const struct sched_class rt_sched_class = { 1315static const struct sched_class rt_sched_class = {
1313 .next = &fair_sched_class, 1316 .next = &fair_sched_class,
1314 .enqueue_task = enqueue_task_rt, 1317 .enqueue_task = enqueue_task_rt,
1315 .dequeue_task = dequeue_task_rt, 1318 .dequeue_task = dequeue_task_rt,
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..5e41217239e8 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,10 +54,9 @@ void down(struct semaphore *sem)
54 unsigned long flags; 54 unsigned long flags;
55 55
56 spin_lock_irqsave(&sem->lock, flags); 56 spin_lock_irqsave(&sem->lock, flags);
57 if (likely(sem->count > 0)) 57 if (unlikely(!sem->count))
58 sem->count--;
59 else
60 __down(sem); 58 __down(sem);
59 sem->count--;
61 spin_unlock_irqrestore(&sem->lock, flags); 60 spin_unlock_irqrestore(&sem->lock, flags);
62} 61}
63EXPORT_SYMBOL(down); 62EXPORT_SYMBOL(down);
@@ -77,10 +76,10 @@ int down_interruptible(struct semaphore *sem)
77 int result = 0; 76 int result = 0;
78 77
79 spin_lock_irqsave(&sem->lock, flags); 78 spin_lock_irqsave(&sem->lock, flags);
80 if (likely(sem->count > 0)) 79 if (unlikely(!sem->count))
81 sem->count--;
82 else
83 result = __down_interruptible(sem); 80 result = __down_interruptible(sem);
81 if (!result)
82 sem->count--;
84 spin_unlock_irqrestore(&sem->lock, flags); 83 spin_unlock_irqrestore(&sem->lock, flags);
85 84
86 return result; 85 return result;
@@ -103,10 +102,10 @@ int down_killable(struct semaphore *sem)
103 int result = 0; 102 int result = 0;
104 103
105 spin_lock_irqsave(&sem->lock, flags); 104 spin_lock_irqsave(&sem->lock, flags);
106 if (likely(sem->count > 0)) 105 if (unlikely(!sem->count))
107 sem->count--;
108 else
109 result = __down_killable(sem); 106 result = __down_killable(sem);
107 if (!result)
108 sem->count--;
110 spin_unlock_irqrestore(&sem->lock, flags); 109 spin_unlock_irqrestore(&sem->lock, flags);
111 110
112 return result; 111 return result;
@@ -157,10 +156,10 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 int result = 0; 156 int result = 0;
158 157
159 spin_lock_irqsave(&sem->lock, flags); 158 spin_lock_irqsave(&sem->lock, flags);
160 if (likely(sem->count > 0)) 159 if (unlikely(!sem->count))
161 sem->count--;
162 else
163 result = __down_timeout(sem, jiffies); 160 result = __down_timeout(sem, jiffies);
161 if (!result)
162 sem->count--;
164 spin_unlock_irqrestore(&sem->lock, flags); 163 spin_unlock_irqrestore(&sem->lock, flags);
165 164
166 return result; 165 return result;
@@ -179,9 +178,8 @@ void up(struct semaphore *sem)
179 unsigned long flags; 178 unsigned long flags;
180 179
181 spin_lock_irqsave(&sem->lock, flags); 180 spin_lock_irqsave(&sem->lock, flags);
182 if (likely(list_empty(&sem->wait_list))) 181 sem->count++;
183 sem->count++; 182 if (unlikely(!list_empty(&sem->wait_list)))
184 else
185 __up(sem); 183 __up(sem);
186 spin_unlock_irqrestore(&sem->lock, flags); 184 spin_unlock_irqrestore(&sem->lock, flags);
187} 185}
@@ -192,7 +190,6 @@ EXPORT_SYMBOL(up);
192struct semaphore_waiter { 190struct semaphore_waiter {
193 struct list_head list; 191 struct list_head list;
194 struct task_struct *task; 192 struct task_struct *task;
195 int up;
196}; 193};
197 194
198/* 195/*
@@ -205,33 +202,34 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
205{ 202{
206 struct task_struct *task = current; 203 struct task_struct *task = current;
207 struct semaphore_waiter waiter; 204 struct semaphore_waiter waiter;
205 int ret = 0;
208 206
209 list_add_tail(&waiter.list, &sem->wait_list);
210 waiter.task = task; 207 waiter.task = task;
211 waiter.up = 0; 208 list_add_tail(&waiter.list, &sem->wait_list);
212 209
213 for (;;) { 210 for (;;) {
214 if (state == TASK_INTERRUPTIBLE && signal_pending(task)) 211 if (state == TASK_INTERRUPTIBLE && signal_pending(task)) {
215 goto interrupted; 212 ret = -EINTR;
216 if (state == TASK_KILLABLE && fatal_signal_pending(task)) 213 break;
217 goto interrupted; 214 }
218 if (timeout <= 0) 215 if (state == TASK_KILLABLE && fatal_signal_pending(task)) {
219 goto timed_out; 216 ret = -EINTR;
217 break;
218 }
219 if (timeout <= 0) {
220 ret = -ETIME;
221 break;
222 }
220 __set_task_state(task, state); 223 __set_task_state(task, state);
221 spin_unlock_irq(&sem->lock); 224 spin_unlock_irq(&sem->lock);
222 timeout = schedule_timeout(timeout); 225 timeout = schedule_timeout(timeout);
223 spin_lock_irq(&sem->lock); 226 spin_lock_irq(&sem->lock);
224 if (waiter.up) 227 if (sem->count > 0)
225 return 0; 228 break;
226 } 229 }
227 230
228 timed_out:
229 list_del(&waiter.list);
230 return -ETIME;
231
232 interrupted:
233 list_del(&waiter.list); 231 list_del(&waiter.list);
234 return -EINTR; 232 return ret;
235} 233}
236 234
237static noinline void __sched __down(struct semaphore *sem) 235static noinline void __sched __down(struct semaphore *sem)
@@ -258,7 +256,5 @@ static noinline void __sched __up(struct semaphore *sem)
258{ 256{
259 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, 257 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
260 struct semaphore_waiter, list); 258 struct semaphore_waiter, list);
261 list_del(&waiter->list);
262 waiter->up = 1;
263 wake_up_process(waiter->task); 259 wake_up_process(waiter->task);
264} 260}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 73961f35fdc8..dadde5361f32 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
471/* 471/*
472 * Sysfs setup bits: 472 * Sysfs setup bits:
473 */ 473 */
474static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 474static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
475 sysfs_override_clocksource); 475 sysfs_override_clocksource);
476 476
477static SYSDEV_ATTR(available_clocksource, 0600, 477static SYSDEV_ATTR(available_clocksource, 0444,
478 sysfs_show_available_clocksources, NULL); 478 sysfs_show_available_clocksources, NULL);
479 479
480static struct sysdev_class clocksource_sysclass = { 480static struct sysdev_class clocksource_sysclass = {