aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-06-17 07:06:17 -0400
committerIngo Molnar <mingo@elte.hu>2009-06-17 07:06:17 -0400
commita3d06cc6aa3e765dc2bf98626f87272dcf641dca (patch)
treeaa3e49b58f08d6c0ea55cdca4fb5e6c8ba6ae333 /kernel
parent0990b1c65729012a63e0eeca93aaaafea4e9a064 (diff)
parent65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)
Merge branch 'linus' into perfcounters/core
Conflicts: arch/x86/include/asm/kmap_types.h include/linux/mm.h include/asm-generic/kmap_types.h Merge reason: We crossed changes with kmap_types.h cleanups in mainline. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cpuset.c260
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/groups.c288
-rw-r--r--kernel/hrtimer.c58
-rw-r--r--kernel/kallsyms.c134
-rw-r--r--kernel/kfifo.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c5
-rw-r--r--kernel/printk.c33
-rw-r--r--kernel/profile.c8
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/sched.c11
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/slow-work.c23
-rw-r--r--kernel/softirq.c11
-rw-r--r--kernel/sys.c283
-rw-r--r--kernel/sysctl.c22
-rw-r--r--kernel/time/clockevents.c14
-rw-r--r--kernel/time/clocksource.c20
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-oneshot.c17
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/timer.c52
-rw-r--r--kernel/trace/Kconfig10
-rw-r--r--kernel/trace/ring_buffer.c3
-rw-r--r--kernel/trace/trace.c23
-rw-r--r--kernel/trace/trace_sysprof.c3
-rw-r--r--kernel/user.c67
31 files changed, 804 insertions, 588 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 90b53f6dc226..9df4501cb921 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5a7e17474ee..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by other task, we use alloc_lock in the task_struct fields to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1147 * @cs: the cpuset in which each task's spread flags needs to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
diff --git a/kernel/fork.c b/kernel/fork.c
index 4430eb1376f2..be022c200da6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 178 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 179 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 180 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 181 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 182#endif
183 183
184 /* do the arch specific task caches init */ 184 /* do the arch specific task caches init */
@@ -1470,20 +1470,20 @@ void __init proc_caches_init(void)
1470{ 1470{
1471 sighand_cachep = kmem_cache_create("sighand_cache", 1471 sighand_cachep = kmem_cache_create("sighand_cache",
1472 sizeof(struct sighand_struct), 0, 1472 sizeof(struct sighand_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1474 sighand_ctor); 1474 SLAB_NOTRACK, sighand_ctor);
1475 signal_cachep = kmem_cache_create("signal_cache", 1475 signal_cachep = kmem_cache_create("signal_cache",
1476 sizeof(struct signal_struct), 0, 1476 sizeof(struct signal_struct), 0,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 files_cachep = kmem_cache_create("files_cache", 1478 files_cachep = kmem_cache_create("files_cache",
1479 sizeof(struct files_struct), 0, 1479 sizeof(struct files_struct), 0,
1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1481 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1482 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1484 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1488 mmap_init(); 1488 mmap_init();
1489} 1489}
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
262
263/*
264 * Check whether we're fsgid/egid or in the supplemental group..
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..b675a67c9ac3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/timer.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
@@ -193,12 +195,24 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
193 * Switch the timer base to the current CPU when possible. 195 * Switch the timer base to the current CPU when possible.
194 */ 196 */
195static inline struct hrtimer_clock_base * 197static inline struct hrtimer_clock_base *
196switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) 198switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
199 int pinned)
197{ 200{
198 struct hrtimer_clock_base *new_base; 201 struct hrtimer_clock_base *new_base;
199 struct hrtimer_cpu_base *new_cpu_base; 202 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1;
204
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
200 213
201 new_cpu_base = &__get_cpu_var(hrtimer_bases); 214again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
202 new_base = &new_cpu_base->clock_base[base->index]; 216 new_base = &new_cpu_base->clock_base[base->index];
203 217
204 if (base != new_base) { 218 if (base != new_base) {
@@ -218,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
218 timer->base = NULL; 232 timer->base = NULL;
219 spin_unlock(&base->cpu_base->lock); 233 spin_unlock(&base->cpu_base->lock);
220 spin_lock(&new_base->cpu_base->lock); 234 spin_lock(&new_base->cpu_base->lock);
235
236 /* Optimized away for NOHZ=n SMP=n */
237 if (cpu == preferred_cpu) {
238 /* Calculate clock monotonic expiry time */
239#ifdef CONFIG_HIGH_RES_TIMERS
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
241 new_base->offset);
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 }
221 timer->base = new_base; 269 timer->base = new_base;
222 } 270 }
223 return new_base; 271 return new_base;
@@ -235,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
235 return base; 283 return base;
236} 284}
237 285
238# define switch_hrtimer_base(t, b) (b) 286# define switch_hrtimer_base(t, b, p) (b)
239 287
240#endif /* !CONFIG_SMP */ 288#endif /* !CONFIG_SMP */
241 289
@@ -907,9 +955,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
907 ret = remove_hrtimer(timer, base); 955 ret = remove_hrtimer(timer, base);
908 956
909 /* Switch the timer base, if necessary: */ 957 /* Switch the timer base, if necessary: */
910 new_base = switch_hrtimer_base(timer, base); 958 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
911 959
912 if (mode == HRTIMER_MODE_REL) { 960 if (mode & HRTIMER_MODE_REL) {
913 tim = ktime_add_safe(tim, new_base->get_time()); 961 tim = ktime_add_safe(tim, new_base->get_time());
914 /* 962 /*
915 * CONFIG_TIME_LOW_RES is a temporary way for architectures 963 * CONFIG_TIME_LOW_RES is a temporary way for architectures
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/*
34 * These will be re-linked against their real values
35 * during the second link stage.
36 */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 37extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak)); 38extern const u8 kallsyms_names[] __attribute__((weak));
36 39
37/* tell the compiler that the count isn't in the small data section if the arch 40/*
38 * has one (eg: FRV) 41 * Tell the compiler that the count isn't in the small data section if the arch
42 * has one (eg: FRV).
39 */ 43 */
40extern const unsigned long kallsyms_num_syms 44extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 45__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
75 return is_kernel_text(addr) || is_kernel_inittext(addr); 79 return is_kernel_text(addr) || is_kernel_inittext(addr);
76} 80}
77 81
78/* expand a compressed symbol data into the resulting uncompressed string, 82/*
79 given the offset to where the symbol is in the compressed stream */ 83 * Expand a compressed symbol data into the resulting uncompressed string,
84 * given the offset to where the symbol is in the compressed stream.
85 */
80static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 86static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
81{ 87{
82 int len, skipped_first = 0; 88 int len, skipped_first = 0;
83 const u8 *tptr, *data; 89 const u8 *tptr, *data;
84 90
85 /* get the compressed symbol length from the first symbol byte */ 91 /* Get the compressed symbol length from the first symbol byte. */
86 data = &kallsyms_names[off]; 92 data = &kallsyms_names[off];
87 len = *data; 93 len = *data;
88 data++; 94 data++;
89 95
90 /* update the offset to return the offset for the next symbol on 96 /*
91 * the compressed stream */ 97 * Update the offset to return the offset for the next symbol on
98 * the compressed stream.
99 */
92 off += len + 1; 100 off += len + 1;
93 101
94 /* for every byte on the compressed symbol data, copy the table 102 /*
95 entry for that byte */ 103 * For every byte on the compressed symbol data, copy the table
96 while(len) { 104 * entry for that byte.
97 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; 105 */
106 while (len) {
107 tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
98 data++; 108 data++;
99 len--; 109 len--;
100 110
101 while (*tptr) { 111 while (*tptr) {
102 if(skipped_first) { 112 if (skipped_first) {
103 *result = *tptr; 113 *result = *tptr;
104 result++; 114 result++;
105 } else 115 } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
110 120
111 *result = '\0'; 121 *result = '\0';
112 122
113 /* return to offset to the next symbol */ 123 /* Return to offset to the next symbol. */
114 return off; 124 return off;
115} 125}
116 126
117/* get symbol type information. This is encoded as a single char at the 127/*
118 * begining of the symbol name */ 128 * Get symbol type information. This is encoded as a single char at the
129 * beginning of the symbol name.
130 */
119static char kallsyms_get_symbol_type(unsigned int off) 131static char kallsyms_get_symbol_type(unsigned int off)
120{ 132{
121 /* get just the first code, look it up in the token table, and return the 133 /*
122 * first char from this token */ 134 * Get just the first code, look it up in the token table,
123 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; 135 * and return the first char from this token.
136 */
137 return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
124} 138}
125 139
126 140
127/* find the offset on the compressed stream given and index in the 141/*
128 * kallsyms array */ 142 * Find the offset on the compressed stream given and index in the
143 * kallsyms array.
144 */
129static unsigned int get_symbol_offset(unsigned long pos) 145static unsigned int get_symbol_offset(unsigned long pos)
130{ 146{
131 const u8 *name; 147 const u8 *name;
132 int i; 148 int i;
133 149
134 /* use the closest marker we have. We have markers every 256 positions, 150 /*
135 * so that should be close enough */ 151 * Use the closest marker we have. We have markers every 256 positions,
136 name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; 152 * so that should be close enough.
153 */
154 name = &kallsyms_names[kallsyms_markers[pos >> 8]];
137 155
138 /* sequentially scan all the symbols up to the point we're searching for. 156 /*
139 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we 157 * Sequentially scan all the symbols up to the point we're searching
140 * just need to add the len to the current pointer for every symbol we 158 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
141 * wish to skip */ 159 * so we just need to add the len to the current pointer for every
142 for(i = 0; i < (pos&0xFF); i++) 160 * symbol we wish to skip.
161 */
162 for (i = 0; i < (pos & 0xFF); i++)
143 name = name + (*name) + 1; 163 name = name + (*name) + 1;
144 164
145 return name - kallsyms_names; 165 return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
190 /* This kernel should never had been booted. */ 210 /* This kernel should never had been booted. */
191 BUG_ON(!kallsyms_addresses); 211 BUG_ON(!kallsyms_addresses);
192 212
193 /* do a binary search on the sorted kallsyms_addresses array */ 213 /* Do a binary search on the sorted kallsyms_addresses array. */
194 low = 0; 214 low = 0;
195 high = kallsyms_num_syms; 215 high = kallsyms_num_syms;
196 216
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
203 } 223 }
204 224
205 /* 225 /*
206 * search for the first aliased symbol. Aliased 226 * Search for the first aliased symbol. Aliased
207 * symbols are symbols with the same address 227 * symbols are symbols with the same address.
208 */ 228 */
209 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 229 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
210 --low; 230 --low;
211 231
212 symbol_start = kallsyms_addresses[low]; 232 symbol_start = kallsyms_addresses[low];
213 233
214 /* Search for next non-aliased symbol */ 234 /* Search for next non-aliased symbol. */
215 for (i = low + 1; i < kallsyms_num_syms; i++) { 235 for (i = low + 1; i < kallsyms_num_syms; i++) {
216 if (kallsyms_addresses[i] > symbol_start) { 236 if (kallsyms_addresses[i] > symbol_start) {
217 symbol_end = kallsyms_addresses[i]; 237 symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
219 } 239 }
220 } 240 }
221 241
222 /* if we found no next symbol, we use the end of the section */ 242 /* If we found no next symbol, we use the end of the section. */
223 if (!symbol_end) { 243 if (!symbol_end) {
224 if (is_kernel_inittext(addr)) 244 if (is_kernel_inittext(addr))
225 symbol_end = (unsigned long)_einittext; 245 symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
252 272
253/* 273/*
254 * Lookup an address 274 * Lookup an address
255 * - modname is set to NULL if it's in the kernel 275 * - modname is set to NULL if it's in the kernel.
256 * - we guarantee that the returned name is valid until we reschedule even if 276 * - We guarantee that the returned name is valid until we reschedule even if.
257 * it resides in a module 277 * It resides in a module.
258 * - we also guarantee that modname will be valid until rescheduled 278 * - We also guarantee that modname will be valid until rescheduled.
259 */ 279 */
260const char *kallsyms_lookup(unsigned long addr, 280const char *kallsyms_lookup(unsigned long addr,
261 unsigned long *symbolsize, 281 unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
276 return namebuf; 296 return namebuf;
277 } 297 }
278 298
279 /* see if it's in a module */ 299 /* See if it's in a module. */
280 return module_address_lookup(addr, symbolsize, offset, modname, 300 return module_address_lookup(addr, symbolsize, offset, modname,
281 namebuf); 301 namebuf);
282} 302}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
294 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 314 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
295 return 0; 315 return 0;
296 } 316 }
297 /* see if it's in a module */ 317 /* See if it's in a module. */
298 return lookup_module_symbol_name(addr, symname); 318 return lookup_module_symbol_name(addr, symname);
299} 319}
300 320
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
313 modname[0] = '\0'; 333 modname[0] = '\0';
314 return 0; 334 return 0;
315 } 335 }
316 /* see if it's in a module */ 336 /* See if it's in a module. */
317 return lookup_module_symbol_attrs(addr, size, offset, modname, name); 337 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
318} 338}
319 339
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
342 362
343 return len; 363 return len;
344} 364}
365EXPORT_SYMBOL_GPL(sprint_symbol);
345 366
346/* Look up a kernel symbol and print it to the kernel messages. */ 367/* Look up a kernel symbol and print it to the kernel messages. */
347void __print_symbol(const char *fmt, unsigned long address) 368void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
352 373
353 printk(fmt, buffer); 374 printk(fmt, buffer);
354} 375}
376EXPORT_SYMBOL(__print_symbol);
355 377
356/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 378/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
357struct kallsym_iter 379struct kallsym_iter {
358{
359 loff_t pos; 380 loff_t pos;
360 unsigned long value; 381 unsigned long value;
361 unsigned int nameoff; /* If iterating in core kernel symbols */ 382 unsigned int nameoff; /* If iterating in core kernel symbols. */
362 char type; 383 char type;
363 char name[KSYM_NAME_LEN]; 384 char name[KSYM_NAME_LEN];
364 char module_name[MODULE_NAME_LEN]; 385 char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
404 iter->pos = pos; 425 iter->pos = pos;
405 return get_ksymbol_mod(iter); 426 return get_ksymbol_mod(iter);
406 } 427 }
407 428
408 /* If we're not on the desired position, reset to new position. */ 429 /* If we're not on the desired position, reset to new position. */
409 if (pos != iter->pos) 430 if (pos != iter->pos)
410 reset_iter(iter, pos); 431 reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
439{ 460{
440 struct kallsym_iter *iter = m->private; 461 struct kallsym_iter *iter = m->private;
441 462
442 /* Some debugging symbols have no name. Ignore them. */ 463 /* Some debugging symbols have no name. Ignore them. */
443 if (!iter->name[0]) 464 if (!iter->name[0])
444 return 0; 465 return 0;
445 466
446 if (iter->module_name[0]) { 467 if (iter->module_name[0]) {
447 char type; 468 char type;
448 469
449 /* Label it "global" if it is exported, 470 /*
450 * "local" if not exported. */ 471 * Label it "global" if it is exported,
472 * "local" if not exported.
473 */
451 type = iter->exported ? toupper(iter->type) : 474 type = iter->exported ? toupper(iter->type) :
452 tolower(iter->type); 475 tolower(iter->type);
453 seq_printf(m, "%0*lx %c %s\t[%s]\n", 476 seq_printf(m, "%0*lx %c %s\t[%s]\n",
454 (int)(2*sizeof(void*)), 477 (int)(2 * sizeof(void *)),
455 iter->value, type, iter->name, iter->module_name); 478 iter->value, type, iter->name, iter->module_name);
456 } else 479 } else
457 seq_printf(m, "%0*lx %c %s\n", 480 seq_printf(m, "%0*lx %c %s\n",
458 (int)(2*sizeof(void*)), 481 (int)(2 * sizeof(void *)),
459 iter->value, iter->type, iter->name); 482 iter->value, iter->type, iter->name);
460 return 0; 483 return 0;
461} 484}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
469 492
470static int kallsyms_open(struct inode *inode, struct file *file) 493static int kallsyms_open(struct inode *inode, struct file *file)
471{ 494{
472 /* We keep iterator in m->private, since normal case is to 495 /*
496 * We keep iterator in m->private, since normal case is to
473 * s_start from where we left off, so we avoid doing 497 * s_start from where we left off, so we avoid doing
474 * using get_symbol_offset for every symbol */ 498 * using get_symbol_offset for every symbol.
499 */
475 struct kallsym_iter *iter; 500 struct kallsym_iter *iter;
476 int ret; 501 int ret;
477 502
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
500 proc_create("kallsyms", 0444, NULL, &kallsyms_operations); 525 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
501 return 0; 526 return 0;
502} 527}
503__initcall(kallsyms_init); 528device_initcall(kallsyms_init);
504
505EXPORT_SYMBOL(__print_symbol);
506EXPORT_SYMBOL_GPL(sprint_symbol);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
236 ignore_signals(tsk); 237 ignore_signals(tsk);
237 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 238 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 set_cpus_allowed_ptr(tsk, cpu_all_mask); 239 set_cpus_allowed_ptr(tsk, cpu_all_mask);
240 set_mems_allowed(node_possible_map);
239 241
240 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 243
diff --git a/kernel/module.c b/kernel/module.c
index e4ab36ce7672..215aaab09e91 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2899,7 +2899,7 @@ void print_modules(void)
2899 struct module *mod; 2899 struct module *mod;
2900 char buf[8]; 2900 char buf[8];
2901 2901
2902 printk("Modules linked in:"); 2902 printk(KERN_DEFAULT "Modules linked in:");
2903 /* Most callers should already have preempt disabled, but make sure */ 2903 /* Most callers should already have preempt disabled, but make sure */
2904 preempt_disable(); 2904 preempt_disable();
2905 list_for_each_entry_rcu(mod, &modules, list) 2905 list_for_each_entry_rcu(mod, &modules, list)
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "powerOff",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
687 sizeof(printk_buf) - printed_len, fmt, args); 687 sizeof(printk_buf) - printed_len, fmt, args);
688 688
689 689
690 p = printk_buf;
691
692 /* Do we have a loglevel in the string? */
693 if (p[0] == '<') {
694 unsigned char c = p[1];
695 if (c && p[2] == '>') {
696 switch (c) {
697 case '0' ... '7': /* loglevel */
698 current_log_level = c - '0';
699 /* Fallthrough - make sure we're on a new line */
700 case 'd': /* KERN_DEFAULT */
701 if (!new_text_line) {
702 emit_log_char('\n');
703 new_text_line = 1;
704 }
705 /* Fallthrough - skip the loglevel */
706 case 'c': /* KERN_CONT */
707 p += 3;
708 break;
709 }
710 }
711 }
712
690 /* 713 /*
691 * Copy the output into log_buf. If the caller didn't provide 714 * Copy the output into log_buf. If the caller didn't provide
692 * appropriate log level tags, we insert them here 715 * appropriate log level tags, we insert them here
693 */ 716 */
694 for (p = printk_buf; *p; p++) { 717 for ( ; *p; p++) {
695 if (new_text_line) { 718 if (new_text_line) {
696 /* If a token, set current_log_level and skip over */
697 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
698 p[2] == '>') {
699 current_log_level = p[1] - '0';
700 p += 3;
701 printed_len -= 3;
702 }
703
704 /* Always output the token */ 719 /* Always output the token */
705 emit_log_char('<'); 720 emit_log_char('<');
706 emit_log_char(current_log_level + '0'); 721 emit_log_char(current_log_level + '0');
diff --git a/kernel/profile.c b/kernel/profile.c
index 28cf26ad2d24..69911b5745eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 node = cpu_to_node(cpu); 365 node = cpu_to_node(cpu);
366 per_cpu(cpu_profile_flip, cpu) = 0; 366 per_cpu(cpu_profile_flip, cpu) = 0;
367 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 367 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
368 page = alloc_pages_node(node, 368 page = alloc_pages_exact_node(node,
369 GFP_KERNEL | __GFP_ZERO, 369 GFP_KERNEL | __GFP_ZERO,
370 0); 370 0);
371 if (!page) 371 if (!page)
@@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
374 } 374 }
375 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 375 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
376 page = alloc_pages_node(node, 376 page = alloc_pages_exact_node(node,
377 GFP_KERNEL | __GFP_ZERO, 377 GFP_KERNEL | __GFP_ZERO,
378 0); 378 0);
379 if (!page) 379 if (!page)
@@ -564,14 +564,14 @@ static int create_hash_tables(void)
564 int node = cpu_to_node(cpu); 564 int node = cpu_to_node(cpu);
565 struct page *page; 565 struct page *page;
566 566
567 page = alloc_pages_node(node, 567 page = alloc_pages_exact_node(node,
568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
569 0); 569 0);
570 if (!page) 570 if (!page)
571 goto out_cleanup; 571 goto out_cleanup;
572 per_cpu(cpu_profile_hits, cpu)[1] 572 per_cpu(cpu_profile_hits, cpu)[1]
573 = (struct profile_hit *)page_address(page); 573 = (struct profile_hit *)page_address(page);
574 page = alloc_pages_node(node, 574 page = alloc_pages_exact_node(node,
575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
576 0); 576 0);
577 if (!page) 577 if (!page)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 820c5af44f3e..fcd107a78c5a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -902,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
902 * Returns: 902 * Returns:
903 * 0 on success 903 * 0 on success
904 * -EINTR when interrupted by a signal 904 * -EINTR when interrupted by a signal
905 * -ETIMEOUT when the timeout expired 905 * -ETIMEDOUT when the timeout expired
906 * -EDEADLK when the lock would deadlock (when deadlock detection is on) 906 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
907 */ 907 */
908int 908int
diff --git a/kernel/sched.c b/kernel/sched.c
index 8ec9d13140be..8fb88a906aaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -240,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
240 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
241 delta = ktime_to_ns(ktime_sub(hard, soft)); 241 delta = ktime_to_ns(ktime_sub(hard, soft));
242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
243 HRTIMER_MODE_ABS, 0); 243 HRTIMER_MODE_ABS_PINNED, 0);
244 } 244 }
245 spin_unlock(&rt_b->rt_runtime_lock); 245 spin_unlock(&rt_b->rt_runtime_lock);
246} 246}
@@ -1155,7 +1155,7 @@ static __init void init_hrtick(void)
1155static void hrtick_start(struct rq *rq, u64 delay) 1155static void hrtick_start(struct rq *rq, u64 delay)
1156{ 1156{
1157 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1157 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1158 HRTIMER_MODE_REL, 0); 1158 HRTIMER_MODE_REL_PINNED, 0);
1159} 1159}
1160 1160
1161static inline void init_hrtick(void) 1161static inline void init_hrtick(void)
@@ -4397,6 +4397,11 @@ static struct {
4397 .load_balancer = ATOMIC_INIT(-1), 4397 .load_balancer = ATOMIC_INIT(-1),
4398}; 4398};
4399 4399
4400int get_nohz_load_balancer(void)
4401{
4402 return atomic_read(&nohz.load_balancer);
4403}
4404
4400#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4405#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4401/** 4406/**
4402 * lowest_flag_domain - Return lowest sched_domain containing flag. 4407 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -9029,6 +9034,8 @@ void __init sched_init_smp(void)
9029} 9034}
9030#endif /* CONFIG_SMP */ 9035#endif /* CONFIG_SMP */
9031 9036
9037const_debug unsigned int sysctl_timer_migration = 1;
9038
9032int in_sched_functions(unsigned long addr) 9039int in_sched_functions(unsigned long addr)
9033{ 9040{
9034 return in_lock_functions(addr) || 9041 return in_lock_functions(addr) ||
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
832{ 832{
833 struct sigpending *pending; 833 struct sigpending *pending;
834 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
835 836
836 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
837 838
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
863 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
864 pass on the info struct. */ 865 pass on the info struct. */
865 866
866 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
867 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
868 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
869 if (q) { 874 if (q) {
870 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
871 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 521ed2004d63..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 382
383EXPORT_SYMBOL(__tasklet_hi_schedule); 383EXPORT_SYMBOL(__tasklet_hi_schedule);
384 384
385void __tasklet_hi_schedule_first(struct tasklet_struct *t)
386{
387 BUG_ON(!irqs_disabled());
388
389 t->next = __get_cpu_var(tasklet_hi_vec).head;
390 __get_cpu_var(tasklet_hi_vec).head = t;
391 __raise_softirq_irqoff(HI_SOFTIRQ);
392}
393
394EXPORT_SYMBOL(__tasklet_hi_schedule_first);
395
385static void tasklet_action(struct softirq_action *a) 396static void tasklet_action(struct softirq_action *a)
386{ 397{
387 struct tasklet_struct *list; 398 struct tasklet_struct *list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 438d99a38c87..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1113,289 +1113,6 @@ out:
1113 return err; 1113 return err;
1114} 1114}
1115 1115
1116/*
1117 * Supplementary group IDs
1118 */
1119
1120/* init to 2 - one for init_task, one to ensure it is never freed */
1121struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1122
1123struct group_info *groups_alloc(int gidsetsize)
1124{
1125 struct group_info *group_info;
1126 int nblocks;
1127 int i;
1128
1129 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1130 /* Make sure we always allocate at least one indirect block pointer */
1131 nblocks = nblocks ? : 1;
1132 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1133 if (!group_info)
1134 return NULL;
1135 group_info->ngroups = gidsetsize;
1136 group_info->nblocks = nblocks;
1137 atomic_set(&group_info->usage, 1);
1138
1139 if (gidsetsize <= NGROUPS_SMALL)
1140 group_info->blocks[0] = group_info->small_block;
1141 else {
1142 for (i = 0; i < nblocks; i++) {
1143 gid_t *b;
1144 b = (void *)__get_free_page(GFP_USER);
1145 if (!b)
1146 goto out_undo_partial_alloc;
1147 group_info->blocks[i] = b;
1148 }
1149 }
1150 return group_info;
1151
1152out_undo_partial_alloc:
1153 while (--i >= 0) {
1154 free_page((unsigned long)group_info->blocks[i]);
1155 }
1156 kfree(group_info);
1157 return NULL;
1158}
1159
1160EXPORT_SYMBOL(groups_alloc);
1161
1162void groups_free(struct group_info *group_info)
1163{
1164 if (group_info->blocks[0] != group_info->small_block) {
1165 int i;
1166 for (i = 0; i < group_info->nblocks; i++)
1167 free_page((unsigned long)group_info->blocks[i]);
1168 }
1169 kfree(group_info);
1170}
1171
1172EXPORT_SYMBOL(groups_free);
1173
1174/* export the group_info to a user-space array */
1175static int groups_to_user(gid_t __user *grouplist,
1176 const struct group_info *group_info)
1177{
1178 int i;
1179 unsigned int count = group_info->ngroups;
1180
1181 for (i = 0; i < group_info->nblocks; i++) {
1182 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1183 unsigned int len = cp_count * sizeof(*grouplist);
1184
1185 if (copy_to_user(grouplist, group_info->blocks[i], len))
1186 return -EFAULT;
1187
1188 grouplist += NGROUPS_PER_BLOCK;
1189 count -= cp_count;
1190 }
1191 return 0;
1192}
1193
1194/* fill a group_info from a user-space array - it must be allocated already */
1195static int groups_from_user(struct group_info *group_info,
1196 gid_t __user *grouplist)
1197{
1198 int i;
1199 unsigned int count = group_info->ngroups;
1200
1201 for (i = 0; i < group_info->nblocks; i++) {
1202 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1203 unsigned int len = cp_count * sizeof(*grouplist);
1204
1205 if (copy_from_user(group_info->blocks[i], grouplist, len))
1206 return -EFAULT;
1207
1208 grouplist += NGROUPS_PER_BLOCK;
1209 count -= cp_count;
1210 }
1211 return 0;
1212}
1213
1214/* a simple Shell sort */
1215static void groups_sort(struct group_info *group_info)
1216{
1217 int base, max, stride;
1218 int gidsetsize = group_info->ngroups;
1219
1220 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1221 ; /* nothing */
1222 stride /= 3;
1223
1224 while (stride) {
1225 max = gidsetsize - stride;
1226 for (base = 0; base < max; base++) {
1227 int left = base;
1228 int right = left + stride;
1229 gid_t tmp = GROUP_AT(group_info, right);
1230
1231 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1232 GROUP_AT(group_info, right) =
1233 GROUP_AT(group_info, left);
1234 right = left;
1235 left -= stride;
1236 }
1237 GROUP_AT(group_info, right) = tmp;
1238 }
1239 stride /= 3;
1240 }
1241}
1242
1243/* a simple bsearch */
1244int groups_search(const struct group_info *group_info, gid_t grp)
1245{
1246 unsigned int left, right;
1247
1248 if (!group_info)
1249 return 0;
1250
1251 left = 0;
1252 right = group_info->ngroups;
1253 while (left < right) {
1254 unsigned int mid = (left+right)/2;
1255 int cmp = grp - GROUP_AT(group_info, mid);
1256 if (cmp > 0)
1257 left = mid + 1;
1258 else if (cmp < 0)
1259 right = mid;
1260 else
1261 return 1;
1262 }
1263 return 0;
1264}
1265
1266/**
1267 * set_groups - Change a group subscription in a set of credentials
1268 * @new: The newly prepared set of credentials to alter
1269 * @group_info: The group list to install
1270 *
1271 * Validate a group subscription and, if valid, insert it into a set
1272 * of credentials.
1273 */
1274int set_groups(struct cred *new, struct group_info *group_info)
1275{
1276 int retval;
1277
1278 retval = security_task_setgroups(group_info);
1279 if (retval)
1280 return retval;
1281
1282 put_group_info(new->group_info);
1283 groups_sort(group_info);
1284 get_group_info(group_info);
1285 new->group_info = group_info;
1286 return 0;
1287}
1288
1289EXPORT_SYMBOL(set_groups);
1290
1291/**
1292 * set_current_groups - Change current's group subscription
1293 * @group_info: The group list to impose
1294 *
1295 * Validate a group subscription and, if valid, impose it upon current's task
1296 * security record.
1297 */
1298int set_current_groups(struct group_info *group_info)
1299{
1300 struct cred *new;
1301 int ret;
1302
1303 new = prepare_creds();
1304 if (!new)
1305 return -ENOMEM;
1306
1307 ret = set_groups(new, group_info);
1308 if (ret < 0) {
1309 abort_creds(new);
1310 return ret;
1311 }
1312
1313 return commit_creds(new);
1314}
1315
1316EXPORT_SYMBOL(set_current_groups);
1317
1318SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1319{
1320 const struct cred *cred = current_cred();
1321 int i;
1322
1323 if (gidsetsize < 0)
1324 return -EINVAL;
1325
1326 /* no need to grab task_lock here; it cannot change */
1327 i = cred->group_info->ngroups;
1328 if (gidsetsize) {
1329 if (i > gidsetsize) {
1330 i = -EINVAL;
1331 goto out;
1332 }
1333 if (groups_to_user(grouplist, cred->group_info)) {
1334 i = -EFAULT;
1335 goto out;
1336 }
1337 }
1338out:
1339 return i;
1340}
1341
1342/*
1343 * SMP: Our groups are copy-on-write. We can set them safely
1344 * without another task interfering.
1345 */
1346
1347SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1348{
1349 struct group_info *group_info;
1350 int retval;
1351
1352 if (!capable(CAP_SETGID))
1353 return -EPERM;
1354 if ((unsigned)gidsetsize > NGROUPS_MAX)
1355 return -EINVAL;
1356
1357 group_info = groups_alloc(gidsetsize);
1358 if (!group_info)
1359 return -ENOMEM;
1360 retval = groups_from_user(group_info, grouplist);
1361 if (retval) {
1362 put_group_info(group_info);
1363 return retval;
1364 }
1365
1366 retval = set_current_groups(group_info);
1367 put_group_info(group_info);
1368
1369 return retval;
1370}
1371
1372/*
1373 * Check whether we're fsgid/egid or in the supplemental group..
1374 */
1375int in_group_p(gid_t grp)
1376{
1377 const struct cred *cred = current_cred();
1378 int retval = 1;
1379
1380 if (grp != cred->fsgid)
1381 retval = groups_search(cred->group_info, grp);
1382 return retval;
1383}
1384
1385EXPORT_SYMBOL(in_group_p);
1386
1387int in_egroup_p(gid_t grp)
1388{
1389 const struct cred *cred = current_cred();
1390 int retval = 1;
1391
1392 if (grp != cred->egid)
1393 retval = groups_search(cred->group_info, grp);
1394 return retval;
1395}
1396
1397EXPORT_SYMBOL(in_egroup_p);
1398
1399DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1400 1117
1401SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ce664f98e3fb..ab462b9968d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -328,6 +329,14 @@ static struct ctl_table kern_table[] = {
328 .mode = 0644, 329 .mode = 0644,
329 .proc_handler = &proc_dointvec, 330 .proc_handler = &proc_dointvec,
330 }, 331 },
332 {
333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration",
335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
331#endif 340#endif
332 { 341 {
333 .ctl_name = CTL_UNNUMBERED, 342 .ctl_name = CTL_UNNUMBERED,
@@ -959,6 +968,17 @@ static struct ctl_table kern_table[] = {
959 .proc_handler = &proc_dointvec, 968 .proc_handler = &proc_dointvec,
960 }, 969 },
961#endif 970#endif
971#ifdef CONFIG_KMEMCHECK
972 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "kmemcheck",
975 .data = &kmemcheck_enabled,
976 .maxlen = sizeof(int),
977 .mode = 0644,
978 .proc_handler = &proc_dointvec,
979 },
980#endif
981
962/* 982/*
963 * NOTE: do not add new entries to this table unless you have read 983 * NOTE: do not add new entries to this table unless you have read
964 * Documentation/sysctl/ctl_unnumbered.txt 984 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1317,7 +1337,6 @@ static struct ctl_table vm_table[] = {
1317 .extra2 = &one, 1337 .extra2 = &one,
1318 }, 1338 },
1319#endif 1339#endif
1320#ifdef CONFIG_UNEVICTABLE_LRU
1321 { 1340 {
1322 .ctl_name = CTL_UNNUMBERED, 1341 .ctl_name = CTL_UNNUMBERED,
1323 .procname = "scan_unevictable_pages", 1342 .procname = "scan_unevictable_pages",
@@ -1326,7 +1345,6 @@ static struct ctl_table vm_table[] = {
1326 .mode = 0644, 1345 .mode = 0644,
1327 .proc_handler = &scan_unevictable_handler, 1346 .proc_handler = &scan_unevictable_handler,
1328 }, 1347 },
1329#endif
1330/* 1348/*
1331 * NOTE: do not add new entries to this table unless you have read 1349 * NOTE: do not add new entries to this table unless you have read
1332 * Documentation/sysctl/ctl_unnumbered.txt 1350 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..1ad6dd461119 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
21 22
22/* The registered clock event devices */ 23/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
54 55
55 return (unsigned long) clc; 56 return (unsigned long) clc;
56} 57}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns);
57 59
58/** 60/**
59 * clockevents_set_mode - set the operating mode of a clock event device 61 * clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +189,7 @@ void clockevents_register_device(struct clock_event_device *dev)
187 189
188 spin_unlock(&clockevents_lock); 190 spin_unlock(&clockevents_lock);
189} 191}
192EXPORT_SYMBOL_GPL(clockevents_register_device);
190 193
191/* 194/*
192 * Noop handler when we shut down an event device 195 * Noop handler when we shut down an event device
@@ -251,4 +254,15 @@ void clockevents_notify(unsigned long reason, void *arg)
251 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
252} 255}
253EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
254#endif 268#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 80189f6f1c5a..592bf584d1d2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -509,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
509 } 509 }
510 } 510 }
511 511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
512 /* Reselect, when the override name has changed */ 524 /* Reselect, when the override name has changed */
513 if (ovr != clocksource_override) { 525 if (ovr != clocksource_override) {
514 clocksource_override = ovr; 526 clocksource_override = ovr;
@@ -537,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
537 549
538 spin_lock_irq(&clocksource_lock); 550 spin_lock_irq(&clocksource_lock);
539 list_for_each_entry(src, &clocksource_list, list) { 551 list_for_each_entry(src, &clocksource_list, list) {
540 count += snprintf(buf + count, 552 /*
553 * Don't show non-HRES clocksource if the tick code is
554 * in one shot mode (highres=on or nohz=on)
555 */
556 if (!tick_oneshot_mode_active() ||
557 (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
558 count += snprintf(buf + count,
541 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
542 "%s ", src->name); 560 "%s ", src->name);
543 } 561 }
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..877dbedc3118 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
27 * timer stops in C3 state. 27 * timer stops in C3 state.
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
128 return 0; 128 return 0;
129} 129}
130 130
131/**
132 * tick_check_oneshot_mode - check whether the system is in oneshot mode
133 *
134 * returns 1 when either nohz or highres are enabled. otherwise 0.
135 */
136int tick_oneshot_mode_active(void)
137{
138 unsigned long flags;
139 int ret;
140
141 local_irq_save(flags);
142 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
143 local_irq_restore(flags);
144
145 return ret;
146}
147
131#ifdef CONFIG_HIGH_RES_TIMERS 148#ifdef CONFIG_HIGH_RES_TIMERS
132/** 149/**
133 * tick_init_highres - switch to high resolution mode 150 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..2aff39c6f10c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -349,7 +349,7 @@ void tick_nohz_stop_sched_tick(int inidle)
349 349
350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
351 hrtimer_start(&ts->sched_timer, expires, 351 hrtimer_start(&ts->sched_timer, expires,
352 HRTIMER_MODE_ABS); 352 HRTIMER_MODE_ABS_PINNED);
353 /* Check, if the timer was already in the past */ 353 /* Check, if the timer was already in the past */
354 if (hrtimer_active(&ts->sched_timer)) 354 if (hrtimer_active(&ts->sched_timer))
355 goto out; 355 goto out;
@@ -395,7 +395,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
395 395
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer, 397 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS); 398 HRTIMER_MODE_ABS_PINNED);
399 /* Check, if the timer was already in the past */ 399 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer)) 400 if (hrtimer_active(&ts->sched_timer))
401 break; 401 break;
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
698 698
699 for (;;) { 699 for (;;) {
700 hrtimer_forward(&ts->sched_timer, now, tick_period); 700 hrtimer_forward(&ts->sched_timer, now, tick_period);
701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); 701 hrtimer_start_expires(&ts->sched_timer,
702 HRTIMER_MODE_ABS_PINNED);
702 /* Check, if the timer was already in the past */ 703 /* Check, if the timer was already in the past */
703 if (hrtimer_active(&ts->sched_timer)) 704 if (hrtimer_active(&ts->sched_timer))
704 break; 705 break;
diff --git a/kernel/timer.c b/kernel/timer.c
index c01e568935ea..54d3912f8cad 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -38,6 +38,7 @@
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_counter.h>
41#include <linux/sched.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/unistd.h> 44#include <asm/unistd.h>
@@ -605,13 +606,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
605} 606}
606 607
607static inline int 608static inline int
608__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) 609__mod_timer(struct timer_list *timer, unsigned long expires,
610 bool pending_only, int pinned)
609{ 611{
610 struct tvec_base *base, *new_base; 612 struct tvec_base *base, *new_base;
611 unsigned long flags; 613 unsigned long flags;
612 int ret; 614 int ret = 0 , cpu;
613
614 ret = 0;
615 615
616 timer_stats_timer_set_start_info(timer); 616 timer_stats_timer_set_start_info(timer);
617 BUG_ON(!timer->function); 617 BUG_ON(!timer->function);
@@ -630,6 +630,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
630 630
631 new_base = __get_cpu_var(tvec_bases); 631 new_base = __get_cpu_var(tvec_bases);
632 632
633 cpu = smp_processor_id();
634
635#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
636 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
637 int preferred_cpu = get_nohz_load_balancer();
638
639 if (preferred_cpu >= 0)
640 cpu = preferred_cpu;
641 }
642#endif
643 new_base = per_cpu(tvec_bases, cpu);
644
633 if (base != new_base) { 645 if (base != new_base) {
634 /* 646 /*
635 * We are trying to schedule the timer on the local CPU. 647 * We are trying to schedule the timer on the local CPU.
@@ -669,7 +681,7 @@ out_unlock:
669 */ 681 */
670int mod_timer_pending(struct timer_list *timer, unsigned long expires) 682int mod_timer_pending(struct timer_list *timer, unsigned long expires)
671{ 683{
672 return __mod_timer(timer, expires, true); 684 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
673} 685}
674EXPORT_SYMBOL(mod_timer_pending); 686EXPORT_SYMBOL(mod_timer_pending);
675 687
@@ -703,11 +715,33 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
703 if (timer->expires == expires && timer_pending(timer)) 715 if (timer->expires == expires && timer_pending(timer))
704 return 1; 716 return 1;
705 717
706 return __mod_timer(timer, expires, false); 718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
707} 719}
708EXPORT_SYMBOL(mod_timer); 720EXPORT_SYMBOL(mod_timer);
709 721
710/** 722/**
723 * mod_timer_pinned - modify a timer's timeout
724 * @timer: the timer to be modified
725 * @expires: new timeout in jiffies
726 *
727 * mod_timer_pinned() is a way to update the expire field of an
728 * active timer (if the timer is inactive it will be activated)
729 * and not allow the timer to be migrated to a different CPU.
730 *
731 * mod_timer_pinned(timer, expires) is equivalent to:
732 *
733 * del_timer(timer); timer->expires = expires; add_timer(timer);
734 */
735int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
736{
737 if (timer->expires == expires && timer_pending(timer))
738 return 1;
739
740 return __mod_timer(timer, expires, false, TIMER_PINNED);
741}
742EXPORT_SYMBOL(mod_timer_pinned);
743
744/**
711 * add_timer - start a timer 745 * add_timer - start a timer
712 * @timer: the timer to be added 746 * @timer: the timer to be added
713 * 747 *
@@ -757,6 +791,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
757 wake_up_idle_cpu(cpu); 791 wake_up_idle_cpu(cpu);
758 spin_unlock_irqrestore(&base->lock, flags); 792 spin_unlock_irqrestore(&base->lock, flags);
759} 793}
794EXPORT_SYMBOL_GPL(add_timer_on);
760 795
761/** 796/**
762 * del_timer - deactive a timer. 797 * del_timer - deactive a timer.
@@ -1016,6 +1051,9 @@ cascade:
1016 index = slot = timer_jiffies & TVN_MASK; 1051 index = slot = timer_jiffies & TVN_MASK;
1017 do { 1052 do {
1018 list_for_each_entry(nte, varp->vec + slot, entry) { 1053 list_for_each_entry(nte, varp->vec + slot, entry) {
1054 if (tbase_get_deferrable(nte->base))
1055 continue;
1056
1019 found = 1; 1057 found = 1;
1020 if (time_before(nte->expires, expires)) 1058 if (time_before(nte->expires, expires))
1021 expires = nte->expires; 1059 expires = nte->expires;
@@ -1306,7 +1344,7 @@ signed long __sched schedule_timeout(signed long timeout)
1306 expire = timeout + jiffies; 1344 expire = timeout + jiffies;
1307 1345
1308 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1346 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1309 __mod_timer(&timer, expire, false); 1347 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1310 schedule(); 1348 schedule();
1311 del_singleshot_timer_sync(&timer); 1349 del_singleshot_timer_sync(&timer);
1312 1350
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4a13e5a01ce3..61071fecc82e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -147,7 +147,7 @@ config IRQSOFF_TRACER
147 disabled by default and can be runtime (re-)started 147 disabled by default and can be runtime (re-)started
148 via: 148 via:
149 149
150 echo 0 > /debugfs/tracing/tracing_max_latency 150 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
151 151
152 (Note that kernel size and overhead increases with this option 152 (Note that kernel size and overhead increases with this option
153 enabled. This option and the preempt-off timing option can be 153 enabled. This option and the preempt-off timing option can be
@@ -168,7 +168,7 @@ config PREEMPT_TRACER
168 disabled by default and can be runtime (re-)started 168 disabled by default and can be runtime (re-)started
169 via: 169 via:
170 170
171 echo 0 > /debugfs/tracing/tracing_max_latency 171 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
172 172
173 (Note that kernel size and overhead increases with this option 173 (Note that kernel size and overhead increases with this option
174 enabled. This option and the irqs-off timing option can be 174 enabled. This option and the irqs-off timing option can be
@@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES
261 This tracer profiles all the the likely and unlikely macros 261 This tracer profiles all the the likely and unlikely macros
262 in the kernel. It will display the results in: 262 in the kernel. It will display the results in:
263 263
264 /debugfs/tracing/profile_annotated_branch 264 /sys/kernel/debug/tracing/profile_annotated_branch
265 265
266 Note: this will add a significant overhead, only turn this 266 Note: this will add a significant overhead, only turn this
267 on if you need to profile the system's use of these macros. 267 on if you need to profile the system's use of these macros.
@@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES
274 taken in the kernel is recorded whether it hit or miss. 274 taken in the kernel is recorded whether it hit or miss.
275 The results will be displayed in: 275 The results will be displayed in:
276 276
277 /debugfs/tracing/profile_branch 277 /sys/kernel/debug/tracing/profile_branch
278 278
279 This option also enables the likely/unlikely profiler. 279 This option also enables the likely/unlikely profiler.
280 280
@@ -323,7 +323,7 @@ config STACK_TRACER
323 select KALLSYMS 323 select KALLSYMS
324 help 324 help
325 This special tracer records the maximum stack footprint of the 325 This special tracer records the maximum stack footprint of the
326 kernel and displays it in debugfs/tracing/stack_trace. 326 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
327 327
328 This tracer works by hooking into every function call that the 328 This tracer works by hooking into every function call that the
329 kernel executes, and keeping a maximum stack depth value and 329 kernel executes, and keeping a maximum stack depth value and
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2e642b2b7253..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1270 if (tail < BUF_PAGE_SIZE) { 1271 if (tail < BUF_PAGE_SIZE) {
1271 /* Mark the rest of the page with padding */ 1272 /* Mark the rest of the page with padding */
1272 event = __rb_page_index(tail_page, tail); 1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1273 rb_event_set_padding(event); 1275 rb_event_set_padding(event);
1274 } 1276 }
1275 1277
@@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1327 return NULL; 1329 return NULL;
1328 1330
1329 event = __rb_page_index(tail_page, tail); 1331 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield);
1330 rb_update_event(event, type, length); 1333 rb_update_event(event, type, length);
1331 1334
1332 /* The passed in type is zero for DATA */ 1335 /* The passed in type is zero for DATA */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8acd9b81a5d7..c1878bfb2e1e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
344/* 344/*
345 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */ 348 */
349static void 349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
2414 2414
2415static const char readme_msg[] = 2415static const char readme_msg[] =
2416 "tracing mini-HOWTO:\n\n" 2416 "tracing mini-HOWTO:\n\n"
2417 "# mkdir /debug\n" 2417 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2418 "# mount -t debugfs nodev /debug\n\n" 2418 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2419 "# cat /debug/tracing/available_tracers\n"
2420 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2419 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2421 "# cat /debug/tracing/current_tracer\n" 2420 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2422 "nop\n" 2421 "nop\n"
2423 "# echo sched_switch > /debug/tracing/current_tracer\n" 2422 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2424 "# cat /debug/tracing/current_tracer\n" 2423 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2425 "sched_switch\n" 2424 "sched_switch\n"
2426 "# cat /debug/tracing/trace_options\n" 2425 "# cat /sys/kernel/debug/tracing/trace_options\n"
2427 "noprint-parent nosym-offset nosym-addr noverbose\n" 2426 "noprint-parent nosym-offset nosym-addr noverbose\n"
2428 "# echo print-parent > /debug/tracing/trace_options\n" 2427 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2429 "# echo 1 > /debug/tracing/tracing_enabled\n" 2428 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2430 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2429 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2431 "# echo 0 > /debug/tracing/tracing_enabled\n" 2430 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2432; 2431;
2433 2432
2434static ssize_t 2433static ssize_t
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index e04b76cc238a..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 205
206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
207 HRTIMER_MODE_REL_PINNED);
207} 208}
208 209
209static void start_stack_timers(void) 210static void start_stack_timers(void)
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }