path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c20
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cgroup_freezer.c21
-rw-r--r--kernel/compat.c25
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/groups.c6
-rw-r--r--kernel/hrtimer.c67
-rw-r--r--kernel/irq/handle.c3
-rw-r--r--kernel/irq/manage.c89
-rw-r--r--kernel/irq/proc.c60
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/lockdep.c5
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/pm_qos_params.c218
-rw-r--r--kernel/posix-cpu-timers.c298
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/power.h27
-rw-r--r--kernel/power/snapshot.c145
-rw-r--r--kernel/power/swap.c333
-rw-r--r--kernel/power/user.c37
-rw-r--r--kernel/sched.c1
-rw-r--r--kernel/sys.c31
-rw-r--r--kernel/sysctl.c579
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/time.c11
-rw-r--r--kernel/time/clocksource.c48
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/timekeeping.c35
-rw-r--r--kernel/timer.c137
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/user_namespace.c4
-rw-r--r--kernel/workqueue.c36
33 files changed, 1381 insertions, 989 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index e4c0e1fee9b0..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9ec642932ee..291775021b2e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3615,7 +3615,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3615 * @ss: the subsystem to load 3615 * @ss: the subsystem to load
3616 * 3616 *
3617 * This function should be called in a modular subsystem's initcall. If the 3617 * This function should be called in a modular subsystem's initcall. If the
3618 * subsytem is built as a module, it will be assigned a new subsys_id and set 3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the 3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys. 3620 * simpler cgroup_init_subsys.
3621 */ 3621 */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e5c0244962b0..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
89 89
90/* Locks taken and their ordering 90/* Locks taken and their ordering
91 * ------------------------------ 91 * ------------------------------
92 * css_set_lock
93 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
94 * task->alloc_lock (AKA task_lock)
95 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
96 * task->sighand->siglock 96 * task->sighand->siglock
97 * 97 *
98 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
100 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
101 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
102 * 102 *
103 * can_attach(): 103 * freezer_can_attach():
104 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
105 * 105 *
106 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
107 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
108 * 108 *
109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
110 * task->alloc_lock (to get task's cgroup)
111 * freezer->lock 110 * freezer->lock
112 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
113 * 112 *
114 * freezer_read(): 113 * freezer_read():
115 * cgroup_mutex 114 * cgroup_mutex
116 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
118 * 119 *
119 * freezer_write() (freeze): 120 * freezer_write() (freeze):
120 * cgroup_mutex 121 * cgroup_mutex
121 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
122 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
123 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
124 * 127 *
125 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
126 * cgroup_mutex 129 * cgroup_mutex
127 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
128 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
129 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
130 * sighand->siglock 135 * sighand->siglock
131 */ 136 */
132static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
495{ 495{
496 int ret; 496 int ret;
497 cpumask_var_t mask; 497 cpumask_var_t mask;
498 unsigned long *k;
499 unsigned int min_length = cpumask_size();
500
501 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
502 min_length = sizeof(compat_ulong_t);
503 498
504 if (len < min_length) 499 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
500 return -EINVAL;
501 if (len & (sizeof(compat_ulong_t)-1))
505 return -EINVAL; 502 return -EINVAL;
506 503
507 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 504 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
508 return -ENOMEM; 505 return -ENOMEM;
509 506
510 ret = sched_getaffinity(pid, mask); 507 ret = sched_getaffinity(pid, mask);
511 if (ret < 0) 508 if (ret == 0) {
512 goto out; 509 size_t retlen = min_t(size_t, len, cpumask_size());
513 510
514 k = cpumask_bits(mask); 511 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
515 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 512 ret = -EFAULT;
516 if (ret == 0) 513 else
517 ret = min_length; 514 ret = retlen;
518 515 }
519out:
520 free_cpumask_var(mask); 516 free_cpumask_var(mask);
517
521 return ret; 518 return ret;
522} 519}
523 520
diff --git a/kernel/cred.c b/kernel/cred.c
index 8f3672a58a1e..2c24870c55d1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -522,8 +522,6 @@ int commit_creds(struct cred *new)
522#endif 522#endif
523 BUG_ON(atomic_read(&new->usage) < 1); 523 BUG_ON(atomic_read(&new->usage) < 1);
524 524
525 security_commit_creds(new, old);
526
527 get_cred(new); /* we will require a ref for the subj creds too */ 525 get_cred(new); /* we will require a ref for the subj creds too */
528 526
529 /* dumpability changes */ 527 /* dumpability changes */
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..b9b134b35088 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel give the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488730a5..3164ba7ce151 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -757,16 +773,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
757 if (new->flags & IRQF_ONESHOT) 773 if (new->flags & IRQF_ONESHOT)
758 desc->status |= IRQ_ONESHOT; 774 desc->status |= IRQ_ONESHOT;
759 775
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
770 if (!(desc->status & IRQ_NOAUTOEN)) { 776 if (!(desc->status & IRQ_NOAUTOEN)) {
771 desc->depth = 0; 777 desc->depth = 0;
772 desc->status &= ~IRQ_DISABLED; 778 desc->status &= ~IRQ_DISABLED;
@@ -916,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
916 desc->chip->disable(irq); 922 desc->chip->disable(irq);
917 } 923 }
918 924
925#ifdef CONFIG_SMP
926 /* make sure affinity_hint is cleaned up */
927 if (WARN_ON_ONCE(desc->affinity_hint))
928 desc->affinity_hint = NULL;
929#endif
930
919 raw_spin_unlock_irqrestore(&desc->lock, flags); 931 raw_spin_unlock_irqrestore(&desc->lock, flags);
920 932
921 unregister_handler_proc(irq, action); 933 unregister_handler_proc(irq, action);
@@ -1027,7 +1039,6 @@ EXPORT_SYMBOL(free_irq);
1027 * Flags: 1039 * Flags:
1028 * 1040 *
1029 * IRQF_SHARED Interrupt is shared 1041 * IRQF_SHARED Interrupt is shared
1030 * IRQF_DISABLED Disable local interrupts while processing
1031 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1042 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1032 * IRQF_TRIGGER_* Specify active edge(s) or level 1043 * IRQF_TRIGGER_* Specify active edge(s) or level
1033 * 1044 *
@@ -1041,25 +1052,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1041 int retval; 1052 int retval;
1042 1053
1043 /* 1054 /*
1044 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1045 * the _first_ irqaction (sigh). That can cause oopsing, but
1046 * the behavior is classified as "will not fix" so we need to
1047 * start nudging drivers away from using that idiom.
1048 */
1049 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1050 (IRQF_SHARED|IRQF_DISABLED)) {
1051 pr_warning(
1052 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1053 irq, devname);
1054 }
1055
1056#ifdef CONFIG_LOCKDEP
1057 /*
1058 * Lockdep wants atomic interrupt handlers:
1059 */
1060 irqflags |= IRQF_DISABLED;
1061#endif
1062 /*
1063 * Sanity-check: shared interrupts must pass in a real dev-ID, 1055 * Sanity-check: shared interrupts must pass in a real dev-ID,
1064 * otherwise we'll have trouble later trying to figure out 1056 * otherwise we'll have trouble later trying to figure out
1065 * which interrupt is which (messes up the interrupt freeing 1057 * which interrupt is which (messes up the interrupt freeing
@@ -1120,3 +1112,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1120 return retval; 1112 return retval;
1121} 1113}
1122EXPORT_SYMBOL(request_threaded_irq); 1114EXPORT_SYMBOL(request_threaded_irq);
1115
1116/**
1117 * request_any_context_irq - allocate an interrupt line
1118 * @irq: Interrupt line to allocate
1119 * @handler: Function to be called when the IRQ occurs.
1120 * Threaded handler for threaded interrupts.
1121 * @flags: Interrupt type flags
1122 * @name: An ascii name for the claiming device
1123 * @dev_id: A cookie passed back to the handler function
1124 *
1125 * This call allocates interrupt resources and enables the
1126 * interrupt line and IRQ handling. It selects either a
1127 * hardirq or threaded handling method depending on the
1128 * context.
1129 *
1130 * On failure, it returns a negative value. On success,
1131 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1132 */
1133int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1134 unsigned long flags, const char *name, void *dev_id)
1135{
1136 struct irq_desc *desc = irq_to_desc(irq);
1137 int ret;
1138
1139 if (!desc)
1140 return -EINVAL;
1141
1142 if (desc->status & IRQ_NESTED_THREAD) {
1143 ret = request_threaded_irq(irq, NULL, handler,
1144 flags, name, dev_id);
1145 return !ret ? IRQC_IS_NESTED : ret;
1146 }
1147
1148 ret = request_irq(irq, handler, flags, name, dev_id);
1149 return !ret ? IRQC_IS_HARDIRQ : ret;
1150}
1151EXPORT_SYMBOL_GPL(request_any_context_irq);
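
Taken together, the additions above give drivers a per-IRQ affinity hint plus a context-agnostic request helper. A hedged driver-side sketch; the example_* names and the queue mask are illustrative, not from this patch, and the hint mask must stay valid for as long as it is installed:

static irqreturn_t example_handler(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int example_setup_irq(unsigned int irq, void *dev,
                             const struct cpumask *queue_mask)
{
        int ret;

        /* picks hardirq or nested-threaded handling for us */
        ret = request_any_context_irq(irq, example_handler, 0,
                                      "example", dev);
        if (ret < 0)
                return ret;     /* else IRQC_IS_HARDIRQ or IRQC_IS_NESTED */

        /* publish where we would like this vector steered */
        irq_set_affinity_hint(irq, queue_mask);
        return 0;
}

static void example_teardown_irq(unsigned int irq, void *dev)
{
        /* clear the hint first, or __free_irq() will WARN and do it */
        irq_set_affinity_hint(irq, NULL);
        free_irq(irq, dev);
}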
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04ef6b5..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
32 return 0; 32 return 0;
33} 33}
34 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
35#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
36#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
37#endif 58#endif
@@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
84 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
85} 106}
86 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
87static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
88 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
89 .read = seq_read, 115 .read = seq_read,
@@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
92 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
93}; 119};
94 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
95static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
96{ 129{
97 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
147 .release = single_release, 180 .release = single_release,
148 .write = default_affinity_write, 181 .write = default_affinity_write,
149}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
150#endif 203#endif
151 204
152static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
231 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
232 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
233 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
234#endif 294#endif
235 295
236 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
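
Both new entries are plain single_open seq files, so they can be consumed with ordinary reads. A small user-space sketch (the IRQ number 30 is arbitrary); affinity_hint prints a hex cpumask, node a decimal NUMA node:

#include <stdio.h>

int main(void)
{
        char buf[256];
        FILE *f;

        f = fopen("/proc/irq/30/affinity_hint", "r");
        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("affinity_hint: %s", buf);
                fclose(f);
        }

        f = fopen("/proc/irq/30/node", "r");
        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("node: %s", buf);
                fclose(f);
        }
        return 0;
}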
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ec21304856d1..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2711,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2711} 2711}
2712EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2713 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2714/* 2716/*
2715 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2716 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2745,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2745 return 0; 2747 return 0;
2746 } 2748 }
2747 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2748 if (!subclass) 2753 if (!subclass)
2749 class = lock->class_cache; 2754 class = lock->class_cache;
2750 /* 2755 /*
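
The new key lets a subsystem opt an individual lock out of full dependency validation: the lock is still tracked, but check is forced to 1 so its chains are not cross-checked. A hedged sketch using the long-standing lockdep_set_class() helper with the key added here (example_mutex is illustrative):

static DEFINE_MUTEX(example_mutex);

static void __init example_init(void)
{
        /* track this mutex under the no-validate key added above */
        lockdep_set_class(&example_mutex, &__lockdep_no_validate__);
}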
diff --git a/kernel/module.c b/kernel/module.c
index e2564580f3f1..5e14483768bb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1182,7 +1182,7 @@ struct module_notes_attrs {
1182 struct bin_attribute attrs[0]; 1182 struct bin_attribute attrs[0];
1183}; 1183};
1184 1184
1185static ssize_t module_notes_read(struct kobject *kobj, 1185static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1186 struct bin_attribute *bin_attr, 1186 struct bin_attribute *bin_attr,
1187 char *buf, loff_t pos, size_t count) 1187 char *buf, loff_t pos, size_t count)
1188{ 1188{
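
This hunk, like the ksysfs.c one above, adapts a caller to the sysfs change that adds the opening struct file * to bin_attribute ->read(). A hedged sketch of the updated callback shape, with illustrative example_* names:

static ssize_t example_bin_read(struct file *filp, struct kobject *kobj,
                                struct bin_attribute *attr,
                                char *buf, loff_t off, size_t count)
{
        /* fill at most 'count' bytes at offset 'off'; return bytes copied */
        return 0;
}

static struct bin_attribute example_attr = {
        .attr   = { .name = "example", .mode = 0444 },
        .read   = example_bin_read,
};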
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
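
The net effect is a handle-based kernel API: pm_qos_add_request() hands back the list element that later update and remove calls take, replacing the old name-string lookups. A minimal driver-side sketch using the cpu_dma_latency class; the example_* names are illustrative, not from this patch:

static struct pm_qos_request_list *example_qos;

static int example_start(void)
{
        /* start unconstrained; the returned handle is what we keep */
        example_qos = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
                                         PM_QOS_DEFAULT_VALUE);
        return example_qos ? 0 : -ENOMEM;
}

static void example_stream(void)
{
        /* cheap enough for hot paths: ask for <= 50 usec DMA latency */
        pm_qos_update_request(example_qos, 50);
}

static void example_stop(void)
{
        /* drops the request and recomputes the aggregate target */
        pm_qos_remove_request(example_qos);
        example_qos = NULL;
}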
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bc7704b3a443..00bb252f29a2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User don't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -1066,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig)
1066 struct thread_group_cputimer *cputimer = &sig->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1003 unsigned long flags;
1068 1004
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1006 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1079} 1008}
1080 1009
1081static u32 onecputick; 1010static u32 onecputick;
@@ -1112,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1112 } 1041 }
1113} 1042}
1114 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1115/* 1061/*
1116 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1117 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1129,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk,
1129 unsigned long soft; 1075 unsigned long soft;
1130 1076
1131 /* 1077 /*
1132 * Don't sample the current process CPU clocks if there are no timers.
1133 */
1134 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1135 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1136 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1140 stop_process_timers(sig);
1141 return;
1142 }
1143
1144 /*
1145 * Collect the current process totals. 1078 * Collect the current process totals.
1146 */ 1079 */
1147 thread_group_cputimer(tsk, &cputime); 1080 thread_group_cputimer(tsk, &cputime);
@@ -1230,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk,
1230 } 1163 }
1231 } 1164 }
1232 1165
1233 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1234 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1235 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1236 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1237 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1238 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1239 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1240 sig->cputime_expires.virt_exp = virt_expires;
1241 if (sched_expires != 0 &&
1242 (sig->cputime_expires.sched_exp == 0 ||
1243 sig->cputime_expires.sched_exp > sched_expires))
1244 sig->cputime_expires.sched_exp = sched_expires;
1245} 1171}
1246 1172
1247/* 1173/*
@@ -1270,6 +1196,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 goto out; 1196 goto out;
1271 } 1197 }
1272 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1273 } else { 1200 } else {
1274 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1275 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->signal == NULL)) {
@@ -1290,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1291 goto out_unlock; 1218 goto out_unlock;
1292 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1293 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1294 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1295 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1298,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1298 /* 1226 /*
1299 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1300 */ 1228 */
1301 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1302 1232
1303out_unlock: 1233out_unlock:
1304 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1310,23 +1240,6 @@ out:
1310} 1240}
1311 1241
1312/** 1242/**
1313 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1314 *
1315 * @cputime: The struct to compare.
1316 *
1317 * Checks @cputime to see if all fields are zero. Returns true if all fields
1318 * are zero, false if any field is nonzero.
1319 */
1320static inline int task_cputime_zero(const struct task_cputime *cputime)
1321{
1322 if (cputime_eq(cputime->utime, cputime_zero) &&
1323 cputime_eq(cputime->stime, cputime_zero) &&
1324 cputime->sum_exec_runtime == 0)
1325 return 1;
1326 return 0;
1327}
1328
1329/**
1330 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1331 * 1244 *
1332 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1382,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 } 1295 }
1383 1296
1384 sig = tsk->signal; 1297 sig = tsk->signal;
1385 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1386 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1387 1300
1388 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1390,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1390 return 1; 1303 return 1;
1391 } 1304 }
1392 1305
1393 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1394} 1307}
1395 1308
1396/* 1309/*
@@ -1419,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1419 * put them on the firing list. 1332 * put them on the firing list.
1420 */ 1333 */
1421 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1422 check_process_timers(tsk, &firing); 1335 /*
 1336 * If there are any active process-wide timers (POSIX.1b, itimers,
 1337 * RLIMIT_CPU), the cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1423 1341
1424 /* 1342 /*
1425 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1456,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1456} 1374}
1457 1375
1458/* 1376/*
1459 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1460 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1461 * The *newval argument is relative and we update it to be absolute, *oldval
1462 * is absolute and we update it to be relative.
1463 */ 1379 */
1464void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1465 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1466{ 1382{
1467 union cpu_time_count now; 1383 union cpu_time_count now;
1468 struct list_head *head;
1469 1384
1470 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1471 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1472 1387
1473 if (oldval) { 1388 if (oldval) {
1389 /*
 1390 * We are setting an itimer. The *oldval is absolute and we update
 1391 * it to be relative; the *newval argument is relative and we update
 1392 * it to be absolute.
1393 */
1474 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1475 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1476 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1483,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1483 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1484 return; 1404 return;
1485 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1486
1487 /*
1488 * If the RLIMIT_CPU timer will expire before the
1489 * ITIMER_PROF timer, we have nothing else to do.
1490 */
1491 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1492 < cputime_to_secs(*newval))
1493 return;
1494 } 1406 }
1495 1407
1496 /* 1408 /*
 1497 * Check whether there are any process timers already set to fire 1409 * Update expiration cache if we are the earliest timer, or when the
 1498 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit is earlier than the prof_exp cpu timer expiry.
1499 */ 1411 */
1500 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1501 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1502 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1503 struct cpu_timer_list, entry)->expires.cpu,
1504 *newval)) {
1505 switch (clock_idx) {
1506 case CPUCLOCK_PROF:
1507 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1508 break; 1416 break;
1509 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1510 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1511 break; 1420 break;
1512 }
1513 } 1421 }
1514} 1422}
1515 1423
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..524e058dcf06 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
 20 * @sector: physical offset (sector) of the page to read or write.
21 * @page: page we're reading or writing.
 22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
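
The new block_io.c keeps the chained asynchronous I/O pattern of the old swap.c code: when a bio_chain is supplied, each submitted bio is pushed onto a singly linked chain (bi_private is reused as the link), and hib_wait_on_bio_chain() later drains the whole chain once and reports the first error. Below is a small userspace sketch of that pattern, assuming an invented request type; it is an analogue, not the kernel implementation.

#include <stdio.h>
#include <stdlib.h>

struct request {
	int page;		/* which page this request covers */
	int error;		/* filled in on "completion" */
	struct request *next;	/* plays the role of bio->bi_private */
};

/* Analogue of submit() with a non-NULL bio_chain: queue and return. */
static int submit_async(int page, struct request **chain)
{
	struct request *req = malloc(sizeof(*req));

	if (!req)
		return -1;
	req->page = page;
	req->error = 0;		/* pretend the I/O always succeeds */
	req->next = *chain;
	*chain = req;
	return 0;
}

/* Analogue of hib_wait_on_bio_chain(): drain everything, keep one error. */
static int wait_on_chain(struct request **chain)
{
	struct request *req = *chain;
	int ret = 0;

	while (req) {
		struct request *next = req->next;

		if (req->error)
			ret = req->error;
		free(req);
		req = next;
	}
	*chain = NULL;
	return ret;
}

int main(void)
{
	struct request *chain = NULL;
	int i;

	for (i = 0; i < 8; i++)
		submit_async(i, &chain);
	printf("chain drained, status %d\n", wait_on_chain(&chain));
	return 0;
}
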
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1606 * 1606 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1610 *
1616 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1614 * any more.
1620 */ 1615 */
1621 1616
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1618{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1620 return 0;
@@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1625 if (!buffer)
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 if (!handle->offset) { 1628 if (!handle->cur) {
1634 int error; 1629 int error;
1635 1630
1636 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1634 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1642 } 1637 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1646 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1647 } else {
1648 struct page *page;
1649 1642
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1655 */ 1648 */
1656 void *kaddr; 1649 void *kaddr;
1657 1650
1658 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1654 handle->buffer = buffer;
1662 } else { 1655 } else {
1663 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1664 }
1665 } 1657 }
1666 handle->prev = handle->cur;
1667 }
1668 handle->buf_offset = handle->cur_offset;
1669 if (handle->cur_offset + count >= PAGE_SIZE) {
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 } 1658 }
1676 handle->offset += count; 1659 handle->cur++;
1677 return count; 1660 return PAGE_SIZE;
1678} 1661}
1679 1662
1680/** 1663/**
@@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2135 * 2118 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2122 *
2145 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2126 * any more.
2149 */ 2127 */
2150 2128
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2130{
2153 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2154 int error = 0; 2132 int error = 0;
2155 2133
2156 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2136 return 0;
2159 2137
2160 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2161 if (!buffer) 2141 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2146 return -ENOMEM;
2167 2147
2168 handle->buffer = buffer; 2148 handle->buffer = buffer;
2169 } 2149 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2150 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2151 if (error)
2172 if (handle->prev == 0) { 2152 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2153
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2155 if (error)
2179 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2180 2162
2181 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2165 if (error)
2184 return error; 2166 return error;
2185 2167
2186 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2188 if (error) 2170 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2175 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2176 } else {
2215 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2216 } 2183 }
2217 handle->offset += count; 2184 handle->cur++;
2218 return count; 2185 return PAGE_SIZE;
2219} 2186}
2220 2187
2221/** 2188/**
@@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2197{
2231 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2202 free_highmem_data();
2236 } 2203 }
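
After this snapshot.c change, snapshot_read_next() and snapshot_write_next() always hand over exactly one page per call, so handle->cur alone encodes the progress: block 0 is the image header, blocks 1..nr_meta_pages carry packed PFNs, and the remaining blocks carry data pages (the write side is shifted by one because its first call only sets up the buffer). The following userspace sketch models just that counter-driven protocol; the enum and helper names are invented for illustration.

#include <stdio.h>

enum block_kind { BLK_HEADER, BLK_METADATA, BLK_DATA, BLK_DONE };

struct handle {
	unsigned int cur;	/* number of the next PAGE_SIZE block */
};

/* Decide what the next block holds, then advance the counter. */
static enum block_kind next_block(struct handle *h, unsigned int nr_meta_pages,
				  unsigned int nr_copy_pages)
{
	enum block_kind kind;

	if (h->cur > nr_meta_pages + nr_copy_pages)
		return BLK_DONE;
	if (h->cur == 0)
		kind = BLK_HEADER;
	else if (h->cur <= nr_meta_pages)
		kind = BLK_METADATA;
	else
		kind = BLK_DATA;
	h->cur++;
	return kind;
}

int main(void)
{
	static const char *names[] = { "header", "metadata", "data", "done" };
	struct handle h = { 0 };
	enum block_kind k;

	/* Walk a toy image with 2 metadata pages and 3 data pages. */
	while ((k = next_block(&h, 2, 3)) != BLK_DONE)
		printf("block %u: %s\n", h.cur - 1, names[k]);
	return 0;
}
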
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
 35 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
 56 * a file-like way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
 212 * This is called before saving the image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
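
The swap_map_page comment moved to the top of swap.c describes a simple on-disk layout: each map page records MAP_PAGE_ENTRIES sector numbers plus a link to the next map page, so the whole image can be walked from the single starting sector stored in the header (handle->first_sector in this patch). Here is a userspace sketch of that chaining, with a tiny made-up MAP_PAGE_ENTRIES and an in-memory pointer standing in for the .next_swap sector; it illustrates the structure only.

#include <stdio.h>
#include <stdlib.h>

#define MAP_PAGE_ENTRIES 4	/* tiny on purpose; the kernel derives this
				 * from PAGE_SIZE / sizeof(sector_t) - 1 */

struct swap_map_page {
	unsigned long entries[MAP_PAGE_ENTRIES];
	struct swap_map_page *next;	/* stands in for .next_swap */
};

struct swap_map_handle {
	struct swap_map_page *cur;	/* map page being filled or read */
	unsigned int k;			/* index within cur->entries */
};

/* Analogue of swap_write_page(): record one data page's sector. */
static int map_add(struct swap_map_handle *h, unsigned long sector)
{
	h->cur->entries[h->k++] = sector;
	if (h->k >= MAP_PAGE_ENTRIES) {
		struct swap_map_page *next = calloc(1, sizeof(*next));

		if (!next)
			return -1;
		h->cur->next = next;	/* link the chain */
		h->cur = next;
		h->k = 0;
	}
	return 0;
}

int main(void)
{
	struct swap_map_page *first = calloc(1, sizeof(*first));
	struct swap_map_handle h = { .cur = first, .k = 0 };
	unsigned long sector;
	unsigned int i;

	for (sector = 100; sector < 110; sector++)
		map_add(&h, sector);

	/* Walk the chain the way the resume path does. */
	for (; first; first = first->next)
		for (i = 0; i < MAP_PAGE_ENTRIES && first->entries[i]; i++)
			printf("data page at sector %lu\n", first->entries[i]);
	return 0;
}
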
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
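
The user.c change above makes snapshot_read()/snapshot_write() honour partial-page file offsets: a fresh snapshot page is fetched only when *offp sits on a page boundary, otherwise the remainder of the current page is served, and the actual copy is delegated to simple_read_from_buffer()/simple_write_to_buffer(). The sketch below models only the offset arithmetic in plain userspace C, assuming a 4096-byte page for the illustration.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Return how many bytes may be copied at file offset offp without crossing
 * into the next snapshot page, and report whether a fresh page has to be
 * produced first (the snapshot_read_next()/snapshot_write_next() call).
 */
static unsigned long chunk_at(unsigned long offp, int *need_new_page)
{
	unsigned long pg_offp = offp & ~PAGE_MASK;

	*need_new_page = (pg_offp == 0);
	return PAGE_SIZE - pg_offp;
}

int main(void)
{
	unsigned long offsets[] = { 0, 100, 4095, 4096, 5000 };
	unsigned long i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		int fresh;
		unsigned long n = chunk_at(offsets[i], &fresh);

		printf("offset %5lu: %s, up to %lu bytes this call\n",
		       offsets[i], fresh ? "fetch new page" : "reuse page", n);
	}
	return 0;
}
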
diff --git a/kernel/sched.c b/kernel/sched.c
index 1d93cd0ae4d3..d9c0368eeb21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3851,6 +3851,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3851{ 3851{
3852 __wake_up_common(q, mode, 1, 0, NULL); 3852 __wake_up_common(q, mode, 1, 0, NULL);
3853} 3853}
3854EXPORT_SYMBOL_GPL(__wake_up_locked);
3854 3855
3855void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3856void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3856{ 3857{
diff --git a/kernel/sys.c b/kernel/sys.c
index 7cb426a58965..0d36d889c74d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
492 return -ENOMEM; 492 return -ENOMEM;
493 old = current_cred(); 493 old = current_cred();
494 494
495 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
496 if (retval)
497 goto error;
498
499 retval = -EPERM; 495 retval = -EPERM;
500 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
501 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
543 return -ENOMEM; 539 return -ENOMEM;
544 old = current_cred(); 540 old = current_cred();
545 541
546 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
547 if (retval)
548 goto error;
549
550 retval = -EPERM; 542 retval = -EPERM;
551 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
552 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
610 return -ENOMEM; 602 return -ENOMEM;
611 old = current_cred(); 603 old = current_cred();
612 604
613 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
614 if (retval)
615 goto error;
616
617 retval = -EPERM; 605 retval = -EPERM;
618 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
619 new->uid = ruid; 607 new->uid = ruid;
@@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
675 return -ENOMEM; 663 return -ENOMEM;
676 old = current_cred(); 664 old = current_cred();
677 665
678 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
679 if (retval)
680 goto error;
681
682 retval = -EPERM; 666 retval = -EPERM;
683 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
684 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
719 if (!new) 703 if (!new)
720 return -ENOMEM; 704 return -ENOMEM;
721 705
722 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
723 if (retval)
724 goto error;
725 old = current_cred(); 706 old = current_cred();
726 707
727 retval = -EPERM; 708 retval = -EPERM;
@@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
788 return -ENOMEM; 769 return -ENOMEM;
789 old = current_cred(); 770 old = current_cred();
790 771
791 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
792 if (retval)
793 goto error;
794
795 retval = -EPERM; 772 retval = -EPERM;
796 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
797 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
851 old = current_cred(); 828 old = current_cred();
852 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
853 830
854 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
855 goto error;
856
857 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
858 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
859 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
864 } 838 }
865 } 839 }
866 840
867error:
868 abort_creds(new); 841 abort_creds(new);
869 return old_fsuid; 842 return old_fsuid;
870 843
@@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
888 old = current_cred(); 861 old = current_cred();
889 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
890 863
891 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
892 goto error;
893
894 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
895 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
896 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
900 } 870 }
901 } 871 }
902 872
903error:
904 abort_creds(new); 873 abort_creds(new);
905 return old_fsgid; 874 return old_fsgid;
906 875
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..b12583047757 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -163,6 +163,27 @@ static int proc_taint(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 163 void __user *buffer, size_t *lenp, loff_t *ppos);
164#endif 164#endif
165 165
166#ifdef CONFIG_MAGIC_SYSRQ
 167static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
168
169static int sysrq_sysctl_handler(ctl_table *table, int write,
170 void __user *buffer, size_t *lenp,
171 loff_t *ppos)
172{
173 int error;
174
175 error = proc_dointvec(table, write, buffer, lenp, ppos);
176 if (error)
177 return error;
178
179 if (write)
180 sysrq_toggle_support(__sysrq_enabled);
181
182 return 0;
183}
184
185#endif
186
166static struct ctl_table root_table[]; 187static struct ctl_table root_table[];
167static struct ctl_table_root sysctl_table_root; 188static struct ctl_table_root sysctl_table_root;
168static struct ctl_table_header root_table_header = { 189static struct ctl_table_header root_table_header = {
@@ -567,7 +588,7 @@ static struct ctl_table kern_table[] = {
567 .data = &__sysrq_enabled, 588 .data = &__sysrq_enabled,
568 .maxlen = sizeof (int), 589 .maxlen = sizeof (int),
569 .mode = 0644, 590 .mode = 0644,
570 .proc_handler = proc_dointvec, 591 .proc_handler = sysrq_sysctl_handler,
571 }, 592 },
572#endif 593#endif
573#ifdef CONFIG_PROC_SYSCTL 594#ifdef CONFIG_PROC_SYSCTL
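
The two sysctl.c hunks above follow a common proc handler pattern: the generic integer parser (proc_dointvec() in the kernel) fills a private copy of the value, and the side effect (sysrq_toggle_support()) is applied only when the access was a write. Below is a userspace sketch of that shape; parse_int() and toggle_support() are simplified stand-ins, and only the write path is modelled.

#include <stdio.h>
#include <stdlib.h>

static int sysrq_enabled;	/* private copy, like __sysrq_enabled */

/* Stand-in for proc_dointvec(): parse a decimal string into *valp. */
static int parse_int(const char *buf, int *valp)
{
	char *end;
	long v = strtol(buf, &end, 10);

	if (end == buf)
		return -1;	/* nothing parsed */
	*valp = (int)v;
	return 0;
}

/* Stand-in for sysrq_toggle_support(): apply the new value somewhere. */
static void toggle_support(int enable)
{
	printf("sysrq support now %s\n", enable ? "enabled" : "disabled");
}

/* Shape of sysrq_sysctl_handler(): parse first, act only on writes. */
static int sysrq_handler(int write, const char *buf)
{
	int error = parse_int(buf, &sysrq_enabled);

	if (error)
		return error;
	if (write)
		toggle_support(sysrq_enabled);
	return 0;
}

int main(void)
{
	sysrq_handler(1, "1");	/* like: echo 1 > /proc/sys/kernel/sysrq */
	sysrq_handler(1, "0");	/* like: echo 0 > /proc/sys/kernel/sysrq */
	return 0;
}
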
@@ -621,7 +642,7 @@ static struct ctl_table kern_table[] = {
621#endif 642#endif
622 { 643 {
623 .procname = "userprocess_debug", 644 .procname = "userprocess_debug",
624 .data = &sysctl_userprocess_debug, 645 .data = &show_unhandled_signals,
625 .maxlen = sizeof(int), 646 .maxlen = sizeof(int),
626 .mode = 0644, 647 .mode = 0644,
627 .proc_handler = proc_dointvec, 648 .proc_handler = proc_dointvec,
@@ -1431,7 +1452,8 @@ static struct ctl_table fs_table[] = {
1431}; 1452};
1432 1453
1433static struct ctl_table debug_table[] = { 1454static struct ctl_table debug_table[] = {
1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1455#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1456 defined(CONFIG_S390)
1435 { 1457 {
1436 .procname = "exception-trace", 1458 .procname = "exception-trace",
1437 .data = &show_unhandled_signals, 1459 .data = &show_unhandled_signals,
@@ -2040,8 +2062,132 @@ int proc_dostring(struct ctl_table *table, int write,
2040 buffer, lenp, ppos); 2062 buffer, lenp, ppos);
2041} 2063}
2042 2064
2065static size_t proc_skip_spaces(char **buf)
2066{
2067 size_t ret;
2068 char *tmp = skip_spaces(*buf);
2069 ret = tmp - *buf;
2070 *buf = tmp;
2071 return ret;
2072}
2073
2074static void proc_skip_char(char **buf, size_t *size, const char v)
2075{
2076 while (*size) {
2077 if (**buf != v)
2078 break;
2079 (*size)--;
2080 (*buf)++;
2081 }
2082}
2083
2084#define TMPBUFLEN 22
2085/**
2086 * proc_get_long - reads an ASCII formatted integer from a kernel buffer
2087 *
2088 * @buf: a kernel buffer
2089 * @size: size of the kernel buffer
2090 * @val: this is where the number will be stored
2091 * @neg: set to %TRUE if number is negative
2092 * @perm_tr: a vector which contains the allowed trailers
2093 * @perm_tr_len: size of the perm_tr vector
2094 * @tr: pointer to store the trailer character
2095 *
2096 * In case of success 0 is returned and buf and size are updated with
2097 * the number of bytes read. If tr is non-NULL and a trailing
2098 * character exists (size is non-zero after returning from this
2099 * function), tr is updated with the trailing character.
2100 */
2101static int proc_get_long(char **buf, size_t *size,
2102 unsigned long *val, bool *neg,
2103 const char *perm_tr, unsigned perm_tr_len, char *tr)
2104{
2105 int len;
2106 char *p, tmp[TMPBUFLEN];
2107
2108 if (!*size)
2109 return -EINVAL;
2110
2111 len = *size;
2112 if (len > TMPBUFLEN - 1)
2113 len = TMPBUFLEN - 1;
2114
2115 memcpy(tmp, *buf, len);
2116
2117 tmp[len] = 0;
2118 p = tmp;
2119 if (*p == '-' && *size > 1) {
2120 *neg = true;
2121 p++;
2122 } else
2123 *neg = false;
2124 if (!isdigit(*p))
2125 return -EINVAL;
2126
2127 *val = simple_strtoul(p, &p, 0);
2128
2129 len = p - tmp;
2130
2131	 /* We don't know if the next char is whitespace, thus we may accept
2132	 * invalid integers (e.g. 1234...a) or two integers instead of one
2133	 * (e.g. 123...1). So let's not allow such large numbers. */
2134 if (len == TMPBUFLEN - 1)
2135 return -EINVAL;
2136
2137 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2138 return -EINVAL;
2139
2140 if (tr && (len < *size))
2141 *tr = *p;
2142
2143 *buf += len;
2144 *size -= len;
2145
2146 return 0;
2147}
2148
2149/**
2150 * proc_put_long - converts an integer to a decimal ASCII formatted string
2151 *
2152 * @buf: the user buffer
2153 * @size: the size of the user buffer
2154 * @val: the integer to be converted
2155 * @neg: sign of the number, %TRUE for negative
2156 *
2157 * In case of success 0 is returned and buf and size are updated with
2158 * the number of bytes written.
2159 */
2160static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2161 bool neg)
2162{
2163 int len;
2164 char tmp[TMPBUFLEN], *p = tmp;
2165
2166 sprintf(p, "%s%lu", neg ? "-" : "", val);
2167 len = strlen(tmp);
2168 if (len > *size)
2169 len = *size;
2170 if (copy_to_user(*buf, tmp, len))
2171 return -EFAULT;
2172 *size -= len;
2173 *buf += len;
2174 return 0;
2175}
2176#undef TMPBUFLEN
2043 2177
2044static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2178static int proc_put_char(void __user **buf, size_t *size, char c)
2179{
2180 if (*size) {
2181 char __user **buffer = (char __user **)buf;
2182 if (put_user(c, *buffer))
2183 return -EFAULT;
2184 (*size)--, (*buffer)++;
2185 *buf = *buffer;
2186 }
2187 return 0;
2188}
2189
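
The helpers above establish a small convention: an optional leading '-', decimal digits, and at most one trailing character drawn from a caller-supplied set. A minimal userspace sketch of that convention follows; strtoul() stands in for simple_strtoul(), the get_long() name is made up, and this is an illustration rather than the kernel code.

/* Userspace sketch of the proc_get_long()/proc_put_long() conventions. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int get_long(const char **buf, size_t *size, unsigned long *val,
		    bool *neg, const char *perm_tr, size_t perm_tr_len)
{
	const char *p = *buf;
	char *end;

	if (!*size)
		return -1;
	*neg = (*p == '-' && *size > 1);
	if (*neg)
		p++;
	if (!isdigit((unsigned char)*p))
		return -1;
	*val = strtoul(p, &end, 0);
	/* a trailing char, if any, must come from the permitted trailer set */
	if ((size_t)(end - *buf) < *size && *end &&
	    !memchr(perm_tr, *end, perm_tr_len))
		return -1;
	*size -= end - *buf;
	*buf = end;
	return 0;
}

int main(void)
{
	const char *buf = "-42\n";
	size_t left = strlen(buf);
	unsigned long val;
	bool neg;
	const char wspace[] = { ' ', '\t', '\n' };

	if (!get_long(&buf, &left, &val, &neg, wspace, sizeof(wspace)))
		printf("%s%lu (left=%zu)\n", neg ? "-" : "", val, left);
	return 0;
}
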
2190static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2045 int *valp, 2191 int *valp,
2046 int write, void *data) 2192 int write, void *data)
2047{ 2193{
@@ -2050,33 +2196,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2050 } else { 2196 } else {
2051 int val = *valp; 2197 int val = *valp;
2052 if (val < 0) { 2198 if (val < 0) {
2053 *negp = -1; 2199 *negp = true;
2054 *lvalp = (unsigned long)-val; 2200 *lvalp = (unsigned long)-val;
2055 } else { 2201 } else {
2056 *negp = 0; 2202 *negp = false;
2057 *lvalp = (unsigned long)val; 2203 *lvalp = (unsigned long)val;
2058 } 2204 }
2059 } 2205 }
2060 return 0; 2206 return 0;
2061} 2207}
2062 2208
2209static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2210
2063static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2211static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2064 int write, void __user *buffer, 2212 int write, void __user *buffer,
2065 size_t *lenp, loff_t *ppos, 2213 size_t *lenp, loff_t *ppos,
2066 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2214 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2067 int write, void *data), 2215 int write, void *data),
2068 void *data) 2216 void *data)
2069{ 2217{
2070#define TMPBUFLEN 21 2218 int *i, vleft, first = 1, err = 0;
2071 int *i, vleft, first = 1, neg; 2219 unsigned long page = 0;
2072 unsigned long lval; 2220 size_t left;
2073 size_t left, len; 2221 char *kbuf;
2074 2222
2075 char buf[TMPBUFLEN], *p; 2223 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2076 char __user *s = buffer;
2077
2078 if (!tbl_data || !table->maxlen || !*lenp ||
2079 (*ppos && !write)) {
2080 *lenp = 0; 2224 *lenp = 0;
2081 return 0; 2225 return 0;
2082 } 2226 }
@@ -2088,89 +2232,69 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2088 if (!conv) 2232 if (!conv)
2089 conv = do_proc_dointvec_conv; 2233 conv = do_proc_dointvec_conv;
2090 2234
2235 if (write) {
2236 if (left > PAGE_SIZE - 1)
2237 left = PAGE_SIZE - 1;
2238 page = __get_free_page(GFP_TEMPORARY);
2239 kbuf = (char *) page;
2240 if (!kbuf)
2241 return -ENOMEM;
2242 if (copy_from_user(kbuf, buffer, left)) {
2243 err = -EFAULT;
2244 goto free;
2245 }
2246 kbuf[left] = 0;
2247 }
2248
2091 for (; left && vleft--; i++, first=0) { 2249 for (; left && vleft--; i++, first=0) {
2092 if (write) { 2250 unsigned long lval;
2093 while (left) { 2251 bool neg;
2094 char c;
2095 if (get_user(c, s))
2096 return -EFAULT;
2097 if (!isspace(c))
2098 break;
2099 left--;
2100 s++;
2101 }
2102 if (!left)
2103 break;
2104 neg = 0;
2105 len = left;
2106 if (len > sizeof(buf) - 1)
2107 len = sizeof(buf) - 1;
2108 if (copy_from_user(buf, s, len))
2109 return -EFAULT;
2110 buf[len] = 0;
2111 p = buf;
2112 if (*p == '-' && left > 1) {
2113 neg = 1;
2114 p++;
2115 }
2116 if (*p < '0' || *p > '9')
2117 break;
2118 2252
2119 lval = simple_strtoul(p, &p, 0); 2253 if (write) {
2254 left -= proc_skip_spaces(&kbuf);
2120 2255
2121 len = p-buf; 2256 err = proc_get_long(&kbuf, &left, &lval, &neg,
2122 if ((len < left) && *p && !isspace(*p)) 2257 proc_wspace_sep,
2258 sizeof(proc_wspace_sep), NULL);
2259 if (err)
2123 break; 2260 break;
2124 s += len; 2261 if (conv(&neg, &lval, i, 1, data)) {
2125 left -= len; 2262 err = -EINVAL;
2126
2127 if (conv(&neg, &lval, i, 1, data))
2128 break; 2263 break;
2264 }
2129 } else { 2265 } else {
2130 p = buf; 2266 if (conv(&neg, &lval, i, 0, data)) {
2267 err = -EINVAL;
2268 break;
2269 }
2131 if (!first) 2270 if (!first)
2132 *p++ = '\t'; 2271 err = proc_put_char(&buffer, &left, '\t');
2133 2272 if (err)
2134 if (conv(&neg, &lval, i, 0, data)) 2273 break;
2274 err = proc_put_long(&buffer, &left, lval, neg);
2275 if (err)
2135 break; 2276 break;
2136
2137 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2138 len = strlen(buf);
2139 if (len > left)
2140 len = left;
2141 if(copy_to_user(s, buf, len))
2142 return -EFAULT;
2143 left -= len;
2144 s += len;
2145 } 2277 }
2146 } 2278 }
2147 2279
2148 if (!write && !first && left) { 2280 if (!write && !first && left && !err)
2149 if(put_user('\n', s)) 2281 err = proc_put_char(&buffer, &left, '\n');
2150 return -EFAULT; 2282 if (write && !err)
2151 left--, s++; 2283 left -= proc_skip_spaces(&kbuf);
2152 } 2284free:
2153 if (write) { 2285 if (write) {
2154 while (left) { 2286 free_page(page);
2155 char c; 2287 if (first)
2156 if (get_user(c, s++)) 2288 return err ? : -EINVAL;
2157 return -EFAULT;
2158 if (!isspace(c))
2159 break;
2160 left--;
2161 }
2162 } 2289 }
2163 if (write && first)
2164 return -EINVAL;
2165 *lenp -= left; 2290 *lenp -= left;
2166 *ppos += *lenp; 2291 *ppos += *lenp;
2167 return 0; 2292 return err;
2168#undef TMPBUFLEN
2169} 2293}
2170 2294
2171static int do_proc_dointvec(struct ctl_table *table, int write, 2295static int do_proc_dointvec(struct ctl_table *table, int write,
2172 void __user *buffer, size_t *lenp, loff_t *ppos, 2296 void __user *buffer, size_t *lenp, loff_t *ppos,
2173 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2297 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2174 int write, void *data), 2298 int write, void *data),
2175 void *data) 2299 void *data)
2176{ 2300{
@@ -2238,8 +2362,8 @@ struct do_proc_dointvec_minmax_conv_param {
2238 int *max; 2362 int *max;
2239}; 2363};
2240 2364
2241static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2365static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2242 int *valp, 2366 int *valp,
2243 int write, void *data) 2367 int write, void *data)
2244{ 2368{
2245 struct do_proc_dointvec_minmax_conv_param *param = data; 2369 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,10 +2376,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2252 } else { 2376 } else {
2253 int val = *valp; 2377 int val = *valp;
2254 if (val < 0) { 2378 if (val < 0) {
2255 *negp = -1; 2379 *negp = true;
2256 *lvalp = (unsigned long)-val; 2380 *lvalp = (unsigned long)-val;
2257 } else { 2381 } else {
2258 *negp = 0; 2382 *negp = false;
2259 *lvalp = (unsigned long)val; 2383 *lvalp = (unsigned long)val;
2260 } 2384 }
2261 } 2385 }
@@ -2295,102 +2419,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2295 unsigned long convmul, 2419 unsigned long convmul,
2296 unsigned long convdiv) 2420 unsigned long convdiv)
2297{ 2421{
2298#define TMPBUFLEN 21 2422 unsigned long *i, *min, *max;
2299 unsigned long *i, *min, *max, val; 2423 int vleft, first = 1, err = 0;
2300 int vleft, first=1, neg; 2424 unsigned long page = 0;
2301 size_t len, left; 2425 size_t left;
2302 char buf[TMPBUFLEN], *p; 2426 char *kbuf;
2303 char __user *s = buffer; 2427
2304 2428 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2305 if (!data || !table->maxlen || !*lenp ||
2306 (*ppos && !write)) {
2307 *lenp = 0; 2429 *lenp = 0;
2308 return 0; 2430 return 0;
2309 } 2431 }
2310 2432
2311 i = (unsigned long *) data; 2433 i = (unsigned long *) data;
2312 min = (unsigned long *) table->extra1; 2434 min = (unsigned long *) table->extra1;
2313 max = (unsigned long *) table->extra2; 2435 max = (unsigned long *) table->extra2;
2314 vleft = table->maxlen / sizeof(unsigned long); 2436 vleft = table->maxlen / sizeof(unsigned long);
2315 left = *lenp; 2437 left = *lenp;
2316 2438
2439 if (write) {
2440 if (left > PAGE_SIZE - 1)
2441 left = PAGE_SIZE - 1;
2442 page = __get_free_page(GFP_TEMPORARY);
2443 kbuf = (char *) page;
2444 if (!kbuf)
2445 return -ENOMEM;
2446 if (copy_from_user(kbuf, buffer, left)) {
2447 err = -EFAULT;
2448 goto free;
2449 }
2450 kbuf[left] = 0;
2451 }
2452
2317 for (; left && vleft--; i++, min++, max++, first=0) { 2453 for (; left && vleft--; i++, min++, max++, first=0) {
2454 unsigned long val;
2455
2318 if (write) { 2456 if (write) {
2319 while (left) { 2457 bool neg;
2320 char c; 2458
2321 if (get_user(c, s)) 2459 left -= proc_skip_spaces(&kbuf);
2322 return -EFAULT; 2460
2323 if (!isspace(c)) 2461 err = proc_get_long(&kbuf, &left, &val, &neg,
2324 break; 2462 proc_wspace_sep,
2325 left--; 2463 sizeof(proc_wspace_sep), NULL);
2326 s++; 2464 if (err)
2327 }
2328 if (!left)
2329 break;
2330 neg = 0;
2331 len = left;
2332 if (len > TMPBUFLEN-1)
2333 len = TMPBUFLEN-1;
2334 if (copy_from_user(buf, s, len))
2335 return -EFAULT;
2336 buf[len] = 0;
2337 p = buf;
2338 if (*p == '-' && left > 1) {
2339 neg = 1;
2340 p++;
2341 }
2342 if (*p < '0' || *p > '9')
2343 break;
2344 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2345 len = p-buf;
2346 if ((len < left) && *p && !isspace(*p))
2347 break; 2465 break;
2348 if (neg) 2466 if (neg)
2349 val = -val;
2350 s += len;
2351 left -= len;
2352
2353 if(neg)
2354 continue; 2467 continue;
2355 if ((min && val < *min) || (max && val > *max)) 2468 if ((min && val < *min) || (max && val > *max))
2356 continue; 2469 continue;
2357 *i = val; 2470 *i = val;
2358 } else { 2471 } else {
2359 p = buf; 2472 val = convdiv * (*i) / convmul;
2360 if (!first) 2473 if (!first)
2361 *p++ = '\t'; 2474 err = proc_put_char(&buffer, &left, '\t');
2362 sprintf(p, "%lu", convdiv * (*i) / convmul); 2475 err = proc_put_long(&buffer, &left, val, false);
2363 len = strlen(buf); 2476 if (err)
2364 if (len > left) 2477 break;
2365 len = left;
2366 if(copy_to_user(s, buf, len))
2367 return -EFAULT;
2368 left -= len;
2369 s += len;
2370 } 2478 }
2371 } 2479 }
2372 2480
2373 if (!write && !first && left) { 2481 if (!write && !first && left && !err)
2374 if(put_user('\n', s)) 2482 err = proc_put_char(&buffer, &left, '\n');
2375 return -EFAULT; 2483 if (write && !err)
2376 left--, s++; 2484 left -= proc_skip_spaces(&kbuf);
2377 } 2485free:
2378 if (write) { 2486 if (write) {
2379 while (left) { 2487 free_page(page);
2380 char c; 2488 if (first)
2381 if (get_user(c, s++)) 2489 return err ? : -EINVAL;
2382 return -EFAULT;
2383 if (!isspace(c))
2384 break;
2385 left--;
2386 }
2387 } 2490 }
2388 if (write && first)
2389 return -EINVAL;
2390 *lenp -= left; 2491 *lenp -= left;
2391 *ppos += *lenp; 2492 *ppos += *lenp;
2392 return 0; 2493 return err;
2393#undef TMPBUFLEN
2394} 2494}
2395 2495
2396static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2496static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2551,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2451} 2551}
2452 2552
2453 2553
2454static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2554static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2455 int *valp, 2555 int *valp,
2456 int write, void *data) 2556 int write, void *data)
2457{ 2557{
@@ -2463,10 +2563,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2463 int val = *valp; 2563 int val = *valp;
2464 unsigned long lval; 2564 unsigned long lval;
2465 if (val < 0) { 2565 if (val < 0) {
2466 *negp = -1; 2566 *negp = true;
2467 lval = (unsigned long)-val; 2567 lval = (unsigned long)-val;
2468 } else { 2568 } else {
2469 *negp = 0; 2569 *negp = false;
2470 lval = (unsigned long)val; 2570 lval = (unsigned long)val;
2471 } 2571 }
2472 *lvalp = lval / HZ; 2572 *lvalp = lval / HZ;
@@ -2474,7 +2574,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2474 return 0; 2574 return 0;
2475} 2575}
2476 2576
2477static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2577static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2478 int *valp, 2578 int *valp,
2479 int write, void *data) 2579 int write, void *data)
2480{ 2580{
@@ -2486,10 +2586,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2486 int val = *valp; 2586 int val = *valp;
2487 unsigned long lval; 2587 unsigned long lval;
2488 if (val < 0) { 2588 if (val < 0) {
2489 *negp = -1; 2589 *negp = true;
2490 lval = (unsigned long)-val; 2590 lval = (unsigned long)-val;
2491 } else { 2591 } else {
2492 *negp = 0; 2592 *negp = false;
2493 lval = (unsigned long)val; 2593 lval = (unsigned long)val;
2494 } 2594 }
2495 *lvalp = jiffies_to_clock_t(lval); 2595 *lvalp = jiffies_to_clock_t(lval);
@@ -2497,7 +2597,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2497 return 0; 2597 return 0;
2498} 2598}
2499 2599
2500static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2600static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2501 int *valp, 2601 int *valp,
2502 int write, void *data) 2602 int write, void *data)
2503{ 2603{
@@ -2507,10 +2607,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2507 int val = *valp; 2607 int val = *valp;
2508 unsigned long lval; 2608 unsigned long lval;
2509 if (val < 0) { 2609 if (val < 0) {
2510 *negp = -1; 2610 *negp = true;
2511 lval = (unsigned long)-val; 2611 lval = (unsigned long)-val;
2512 } else { 2612 } else {
2513 *negp = 0; 2613 *negp = false;
2514 lval = (unsigned long)val; 2614 lval = (unsigned long)val;
2515 } 2615 }
2516 *lvalp = jiffies_to_msecs(lval); 2616 *lvalp = jiffies_to_msecs(lval);
@@ -2607,6 +2707,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2607 return 0; 2707 return 0;
2608} 2708}
2609 2709
2710/**
2711 * proc_do_large_bitmap - read/write from/to a large bitmap
2712 * @table: the sysctl table
2713 * @write: %TRUE if this is a write to the sysctl file
2714 * @buffer: the user buffer
2715 * @lenp: the size of the user buffer
2716 * @ppos: file position
2717 *
2718 * The bitmap is stored at table->data and the bitmap length (in bits)
2719 * in table->maxlen.
2720 *
2721 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2722 * large bitmaps may be represented in a compact manner. Writing into
2723 * the file will clear the bitmap and then update it with the given input.
2724 *
2725 * Returns 0 on success.
2726 */
2727int proc_do_large_bitmap(struct ctl_table *table, int write,
2728 void __user *buffer, size_t *lenp, loff_t *ppos)
2729{
2730 int err = 0;
2731 bool first = 1;
2732 size_t left = *lenp;
2733 unsigned long bitmap_len = table->maxlen;
2734 unsigned long *bitmap = (unsigned long *) table->data;
2735 unsigned long *tmp_bitmap = NULL;
2736 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2737
2738 if (!bitmap_len || !left || (*ppos && !write)) {
2739 *lenp = 0;
2740 return 0;
2741 }
2742
2743 if (write) {
2744 unsigned long page = 0;
2745 char *kbuf;
2746
2747 if (left > PAGE_SIZE - 1)
2748 left = PAGE_SIZE - 1;
2749
2750 page = __get_free_page(GFP_TEMPORARY);
2751 kbuf = (char *) page;
2752 if (!kbuf)
2753 return -ENOMEM;
2754 if (copy_from_user(kbuf, buffer, left)) {
2755 free_page(page);
2756 return -EFAULT;
2757 }
2758 kbuf[left] = 0;
2759
2760 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2761 GFP_KERNEL);
2762 if (!tmp_bitmap) {
2763 free_page(page);
2764 return -ENOMEM;
2765 }
2766 proc_skip_char(&kbuf, &left, '\n');
2767 while (!err && left) {
2768 unsigned long val_a, val_b;
2769 bool neg;
2770
2771 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2772 sizeof(tr_a), &c);
2773 if (err)
2774 break;
2775 if (val_a >= bitmap_len || neg) {
2776 err = -EINVAL;
2777 break;
2778 }
2779
2780 val_b = val_a;
2781 if (left) {
2782 kbuf++;
2783 left--;
2784 }
2785
2786 if (c == '-') {
2787 err = proc_get_long(&kbuf, &left, &val_b,
2788 &neg, tr_b, sizeof(tr_b),
2789 &c);
2790 if (err)
2791 break;
2792 if (val_b >= bitmap_len || neg ||
2793 val_a > val_b) {
2794 err = -EINVAL;
2795 break;
2796 }
2797 if (left) {
2798 kbuf++;
2799 left--;
2800 }
2801 }
2802
2803 while (val_a <= val_b)
2804 set_bit(val_a++, tmp_bitmap);
2805
2806 first = 0;
2807 proc_skip_char(&kbuf, &left, '\n');
2808 }
2809 free_page(page);
2810 } else {
2811 unsigned long bit_a, bit_b = 0;
2812
2813 while (left) {
2814 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2815 if (bit_a >= bitmap_len)
2816 break;
2817 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2818 bit_a + 1) - 1;
2819
2820 if (!first) {
2821 err = proc_put_char(&buffer, &left, ',');
2822 if (err)
2823 break;
2824 }
2825 err = proc_put_long(&buffer, &left, bit_a, false);
2826 if (err)
2827 break;
2828 if (bit_a != bit_b) {
2829 err = proc_put_char(&buffer, &left, '-');
2830 if (err)
2831 break;
2832 err = proc_put_long(&buffer, &left, bit_b, false);
2833 if (err)
2834 break;
2835 }
2836
2837 first = 0; bit_b++;
2838 }
2839 if (!err)
2840 err = proc_put_char(&buffer, &left, '\n');
2841 }
2842
2843 if (!err) {
2844 if (write) {
2845 if (*ppos)
2846 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2847 else
2848 memcpy(bitmap, tmp_bitmap,
2849 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2850 }
2851 kfree(tmp_bitmap);
2852 *lenp -= left;
2853 *ppos += *lenp;
2854 return 0;
2855 } else {
2856 kfree(tmp_bitmap);
2857 return err;
2858 }
2859}
2860
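
To make the comma-separated range format concrete, here is a userspace sketch that parses "1,3-4,10-10" into a toy 16-bit map and prints it back in the compact form the read path above produces; strtok() and plain arrays are assumptions standing in for proc_get_long() and the kernel bitmap helpers.

/* Userspace illustration of the "1,3-4,10-10" range format. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NBITS 16

int main(void)
{
	const char *input = "1,3-4,10-10\n";
	unsigned char bits[NBITS] = { 0 };
	char *s = strdup(input), *tok;
	unsigned long a, b, i, j;
	int first = 1;

	for (tok = strtok(s, ",\n"); tok; tok = strtok(NULL, ",\n")) {
		char *dash = strchr(tok, '-');

		a = strtoul(tok, NULL, 10);
		b = dash ? strtoul(dash + 1, NULL, 10) : a;
		if (a >= NBITS || b >= NBITS || a > b)
			continue;	/* proc_do_large_bitmap() returns -EINVAL here */
		for (i = a; i <= b; i++)
			bits[i] = 1;
	}

	/* print back in the compact range form the read path produces */
	for (i = 0; i < NBITS; i++) {
		if (!bits[i])
			continue;
		for (j = i; j + 1 < NBITS && bits[j + 1]; j++)
			;
		printf("%s%lu", first ? "" : ",", i);
		if (j != i)
			printf("-%lu", j);
		first = 0;
		i = j;
	}
	printf("\n");	/* -> "1,3-4,10" */
	free(s);
	return 0;
}
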
2610#else /* CONFIG_PROC_FS */ 2861#else /* CONFIG_PROC_FS */
2611 2862
2612int proc_dostring(struct ctl_table *table, int write, 2863int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 59030570f5ca..937d31dc8566 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -224,7 +224,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
227 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
228 {} 227 {}
229}; 228};
230 229
diff --git a/kernel/time.c b/kernel/time.c
index 656dccfe1cbb..50612faa9baf 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 write_seqlock_irq(&xtime_lock); 135 struct timespec delta, adjust;
136 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136 delta.tv_sec = sys_tz.tz_minuteswest * 60;
137 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 delta.tv_nsec = 0;
138 update_xtime_cache(0); 138 adjust = timespec_add_safe(current_kernel_time(), delta);
139 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
140 clock_was_set();
141} 140}
142 141
143/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade-off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (i.e. clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
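
For reference, the mult/shift pair computed by clocks_calc_mult_shift() is applied as ns = (cycles * mult) >> shift; a larger shift buys precision but overflows over long intervals, which is the trade-off MAX_UPDATE_LENGTH bounds. A toy example with made-up numbers (a 10 MHz counter, i.e. 100 ns per cycle):

/* Illustration only, not kernel code: one valid mult/shift pair for a
 * 10 MHz clock is mult = 100 << 24, shift = 24. */
#include <stdio.h>

int main(void)
{
	unsigned long long cycles = 123456;
	unsigned int shift = 24;
	unsigned long long mult = 100ULL << shift;	/* 100 ns per cycle */

	printf("%llu ns\n", (cycles * mult) >> shift);	/* prints 12345600 ns */
	return 0;
}
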
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @cs: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz() helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
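
As the comment says, drivers should go through the wrappers. A hedged sketch of what that might look like for a fictitious 10 MHz clocksource (the foo_* names, rating and frequency are assumptions; mult and shift are deliberately left unset so the new registration path computes them):

/* Hedged sketch: registering a hypothetical 10 MHz clocksource through
 * clocksource_register_hz(), which ends up in
 * __clocksource_register_scale(cs, 1, hz). */
#include <linux/clocksource.h>
#include <linux/init.h>

static cycle_t foo_read(struct clocksource *cs)
{
	return 0;	/* would read the hardware counter here */
}

static struct clocksource foo_clocksource = {
	.name	= "foo",
	.rating	= 300,
	.read	= foo_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init foo_clocksource_init(void)
{
	/* mult/shift are computed for us from the frequency */
	return clocksource_register_hz(&foo_clocksource, 10000000);
}
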
628/** 676/**
629 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 39f6177fafac..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -788,7 +777,6 @@ void update_wall_time(void)
788{ 777{
789 struct clocksource *clock; 778 struct clocksource *clock;
790 cycle_t offset; 779 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 780 int shift = 0, maxshift;
793 781
794 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -847,7 +835,9 @@ void update_wall_time(void)
847 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
848 } 836 }
849 837
850 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
851 * add the remainder to the error difference. 841 * add the remainder to the error difference.
852 */ 842 */
853 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -855,8 +845,15 @@ void update_wall_time(void)
855 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
856 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
857 847
858 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
859 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
 850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
860 857
861 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -896,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
896 893
897unsigned long get_seconds(void) 894unsigned long get_seconds(void)
898{ 895{
899 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
900} 897}
901EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
902 899
903struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
904{ 901{
905 return xtime_cache; 902 return xtime;
906} 903}
907 904
908struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -913,7 +910,7 @@ struct timespec current_kernel_time(void)
913 do { 910 do {
914 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
915 912
916 now = xtime_cache; 913 now = xtime;
917 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
918 915
919 return now; 916 return now;
@@ -928,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
928 do { 925 do {
929 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
930 927
931 now = xtime_cache; 928 now = xtime;
932 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
933 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
934 931
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..9199f3c52215 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -319,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
319} 319}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of time, in jiffies, that a certain timer has
327 * in terms of slack. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
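
A hedged sketch of how a driver might use the new hook: the foo_* timer and the one-second period are invented, and HZ/10 of slack simply tells the core that firing up to roughly 100 ms late is acceptable, so the timer can be coalesced with its neighbours.

/* Hedged sketch: giving a hypothetical housekeeping timer ~100 ms of slack. */
#include <linux/timer.h>
#include <linux/jiffies.h>

static void foo_timeout(unsigned long data)
{
	/* periodic housekeeping would go here */
}

static DEFINE_TIMER(foo_timer, foo_timeout, 0, 0);

static void foo_arm_timer(void)
{
	set_timer_slack(&foo_timer, HZ / 10);	/* up to ~100 ms late is fine */
	mod_timer(&foo_timer, jiffies + HZ);	/* nominally one second out */
}
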
322 340
323static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
324 struct timer_list *timer) 342 struct timer_list *timer)
@@ -550,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
550{ 568{
551 timer->entry.next = NULL; 569 timer->entry.next = NULL;
552 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
553#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
554 timer->start_site = NULL; 573 timer->start_site = NULL;
555 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -715,6 +734,41 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
715} 734}
716EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
717 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) calculate the highest bit where the expires and new max are different
743 * 3) use this bit to make a mask
744 * 4) use the bitmask to round down the maximum time, so that all last
745 * bits are zeros
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires + timer->slack;
754
755 if (timer->slack < 0) /* auto slack: use 0.4% */
756 expires_limit = expires + (expires - jiffies)/256;
757
758 mask = expires ^ expires_limit;
759
760 if (mask == 0)
761 return expires;
762
763 bit = find_last_bit(&mask, BITS_PER_LONG);
764
765 mask = (1 << bit) - 1;
766
767 expires_limit = expires_limit & ~(mask);
768
769 return expires_limit;
770}
771
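
A userspace analogue with arbitrary numbers shows the effect: for expires = 10000, jiffies = 9000 and 40 jiffies of slack, the limit is 10040, the highest bit differing between 10000 and 10040 is bit 5, and the expiry is rounded down to a multiple of 32, i.e. 10016.

/* Userspace analogue of apply_slack(); find_last_bit() is replaced by a
 * plain loop and the numbers are arbitrary. */
#include <stdio.h>

static unsigned long apply_slack_demo(unsigned long expires,
				      unsigned long jiffies_now, long slack)
{
	unsigned long expires_limit, mask;
	int bit;

	expires_limit = expires + slack;
	if (slack < 0)			/* auto slack: ~0.4% of the delay */
		expires_limit = expires + (expires - jiffies_now) / 256;

	mask = expires ^ expires_limit;
	if (mask == 0)
		return expires;

	for (bit = 0; mask >> bit; bit++)	/* find the highest differing bit */
		;
	bit--;

	mask = (1UL << bit) - 1;
	return expires_limit & ~mask;
}

int main(void)
{
	/* expires = 10000, slack = 40 -> limit 10040; highest differing bit
	 * is bit 5, so the result is rounded down to a multiple of 32: 10016 */
	printf("%lu\n", apply_slack_demo(10000, 9000, 40));
	return 0;
}
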
718/** 772/**
719 * mod_timer - modify a timer's timeout 773 * mod_timer - modify a timer's timeout
720 * @timer: the timer to be modified 774 * @timer: the timer to be modified
@@ -745,6 +799,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
745 if (timer_pending(timer) && timer->expires == expires) 799 if (timer_pending(timer) && timer->expires == expires)
746 return 1; 800 return 1;
747 801
802 expires = apply_slack(timer, expires);
803
748 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 804 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
749} 805}
750EXPORT_SYMBOL(mod_timer); 806EXPORT_SYMBOL(mod_timer);
@@ -955,6 +1011,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
955 return index; 1011 return index;
956} 1012}
957 1013
1014static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1015 unsigned long data)
1016{
1017 int preempt_count = preempt_count();
1018
1019#ifdef CONFIG_LOCKDEP
1020 /*
1021	 * It is permissible to free the timer from inside the
1022	 * function that is called from it; we need to take that into
1023	 * account for lockdep too. To avoid bogus "held lock freed"
1024 * warnings as well as problems when looking into
1025 * timer->lockdep_map, make a copy and use that here.
1026 */
1027 struct lockdep_map lockdep_map = timer->lockdep_map;
1028#endif
1029 /*
1030 * Couple the lock chain with the lock chain at
1031 * del_timer_sync() by acquiring the lock_map around the fn()
1032 * call here and in del_timer_sync().
1033 */
1034 lock_map_acquire(&lockdep_map);
1035
1036 trace_timer_expire_entry(timer);
1037 fn(data);
1038 trace_timer_expire_exit(timer);
1039
1040 lock_map_release(&lockdep_map);
1041
1042 if (preempt_count != preempt_count()) {
1043 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1044 fn, preempt_count, preempt_count());
1045 /*
1046 * Restore the preempt count. That gives us a decent
1047 * chance to survive and extract information. If the
1048 * callback kept a lock held, bad luck, but not worse
1049 * than the BUG() we had.
1050 */
1051 preempt_count() = preempt_count;
1052 }
1053}
1054
958#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1055#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
959 1056
960/** 1057/**
@@ -998,45 +1095,7 @@ static inline void __run_timers(struct tvec_base *base)
998 detach_timer(timer, 1); 1095 detach_timer(timer, 1);
999 1096
1000 spin_unlock_irq(&base->lock); 1097 spin_unlock_irq(&base->lock);
1001 { 1098 call_timer_fn(timer, fn, data);
1002 int preempt_count = preempt_count();
1003
1004#ifdef CONFIG_LOCKDEP
1005 /*
1006 * It is permissible to free the timer from
1007 * inside the function that is called from
1008 * it, this we need to take into account for
1009 * lockdep too. To avoid bogus "held lock
1010 * freed" warnings as well as problems when
1011 * looking into timer->lockdep_map, make a
1012 * copy and use that here.
1013 */
1014 struct lockdep_map lockdep_map =
1015 timer->lockdep_map;
1016#endif
1017 /*
1018 * Couple the lock chain with the lock chain at
1019 * del_timer_sync() by acquiring the lock_map
1020 * around the fn() call here and in
1021 * del_timer_sync().
1022 */
1023 lock_map_acquire(&lockdep_map);
1024
1025 trace_timer_expire_entry(timer);
1026 fn(data);
1027 trace_timer_expire_exit(timer);
1028
1029 lock_map_release(&lockdep_map);
1030
1031 if (preempt_count != preempt_count()) {
1032 printk(KERN_ERR "huh, entered %p "
1033 "with preempt_count %08x, exited"
1034 " with %08x?\n",
1035 fn, preempt_count,
1036 preempt_count());
1037 BUG();
1038 }
1039 }
1040 spin_lock_irq(&base->lock); 1099 spin_lock_irq(&base->lock);
1041 } 1100 }
1042 } 1101 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 2404c129a8c9..ab13d7008061 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..b2d70d38dff4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -54,8 +54,8 @@ int create_user_ns(struct cred *new)
54#endif 54#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 56
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */ 57 /* root_user holds a reference to ns, our reference can be dropped */
58 kref_set(&ns->kref, 1); 58 put_user_ns(ns);
59 59
60 return 0; 60 return 0;
61} 61}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5bfb213984b2..77dabbf64b8f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work); 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);