-rw-r--r--Documentation/gpio.txt17
-rw-r--r--Documentation/hrtimer/timer_stats.txt68
-rw-r--r--Documentation/hrtimers/highres.txt249
-rw-r--r--Documentation/hrtimers/hrtimers.txt (renamed from Documentation/hrtimers.txt)0
-rw-r--r--Documentation/kernel-parameters.txt8
-rw-r--r--arch/arm/kernel/irq.c3
-rw-r--r--arch/arm/mach-imx/time.c2
-rw-r--r--arch/arm/mach-ixp4xx/common.c2
-rw-r--r--arch/arm/mach-netx/time.c2
-rw-r--r--arch/arm/mach-pxa/time.c2
-rw-r--r--arch/avr32/kernel/time.c2
-rw-r--r--arch/i386/Kconfig16
-rw-r--r--arch/i386/kernel/Makefile3
-rw-r--r--arch/i386/kernel/acpi/boot.c25
-rw-r--r--arch/i386/kernel/apic.c1629
-rw-r--r--arch/i386/kernel/apm.c44
-rw-r--r--arch/i386/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/i386/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/i386/kernel/cpu/cpufreq/e_powersaver.c334
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.c359
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.h153
-rw-r--r--arch/i386/kernel/cpu/cpufreq/powernow-k8.c6
-rw-r--r--arch/i386/kernel/hpet.c498
-rw-r--r--arch/i386/kernel/i8253.c96
-rw-r--r--arch/i386/kernel/i8259.c7
-rw-r--r--arch/i386/kernel/io_apic.c10
-rw-r--r--arch/i386/kernel/irq.c22
-rw-r--r--arch/i386/kernel/nmi.c9
-rw-r--r--arch/i386/kernel/process.c3
-rw-r--r--arch/i386/kernel/smpboot.c187
-rw-r--r--arch/i386/kernel/time.c124
-rw-r--r--arch/i386/kernel/tsc.c169
-rw-r--r--arch/i386/kernel/tsc_sync.c1
-rw-r--r--arch/i386/kernel/vmitime.c2
-rw-r--r--arch/i386/mach-default/setup.c8
-rw-r--r--arch/mips/kernel/time.c2
-rw-r--r--arch/powerpc/platforms/powermac/pic.c2
-rw-r--r--arch/s390/kernel/time.c2
-rw-r--r--arch/um/os-Linux/sigio.c38
-rw-r--r--arch/x86_64/Kconfig8
-rw-r--r--arch/x86_64/kernel/Makefile4
-rw-r--r--arch/x86_64/kernel/apic.c5
-rw-r--r--arch/x86_64/kernel/hpet.c (renamed from arch/i386/kernel/time_hpet.c)406
-rw-r--r--arch/x86_64/kernel/i8259.c1
-rw-r--r--arch/x86_64/kernel/io_apic.c4
-rw-r--r--arch/x86_64/kernel/pmtimer.c58
-rw-r--r--arch/x86_64/kernel/smpboot.c231
-rw-r--r--arch/x86_64/kernel/time.c961
-rw-r--r--arch/x86_64/kernel/tsc.c226
-rw-r--r--arch/x86_64/kernel/tsc_sync.c187
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S28
-rw-r--r--arch/x86_64/kernel/vsyscall.c121
-rw-r--r--drivers/acpi/processor_idle.c114
-rw-r--r--drivers/char/agp/Makefile1
-rw-r--r--drivers/char/agp/agp.h12
-rw-r--r--drivers/char/agp/ali-agp.c2
-rw-r--r--drivers/char/agp/alpha-agp.c4
-rw-r--r--drivers/char/agp/amd-k7-agp.c1
-rw-r--r--drivers/char/agp/amd64-agp.c11
-rw-r--r--drivers/char/agp/ati-agp.c1
-rw-r--r--drivers/char/agp/backend.c2
-rw-r--r--drivers/char/agp/compat_ioctl.c282
-rw-r--r--drivers/char/agp/compat_ioctl.h105
-rw-r--r--drivers/char/agp/efficeon-agp.c1
-rw-r--r--drivers/char/agp/frontend.c34
-rw-r--r--drivers/char/agp/generic.c125
-rw-r--r--drivers/char/agp/hp-agp.c1
-rw-r--r--drivers/char/agp/i460-agp.c7
-rw-r--r--drivers/char/agp/intel-agp.c202
-rw-r--r--drivers/char/agp/nvidia-agp.c1
-rw-r--r--drivers/char/agp/parisc-agp.c1
-rw-r--r--drivers/char/agp/sgi-agp.c1
-rw-r--r--drivers/char/agp/sis-agp.c1
-rw-r--r--drivers/char/agp/sworks-agp.c1
-rw-r--r--drivers/char/agp/uninorth-agp.c2
-rw-r--r--drivers/char/agp/via-agp.c2
-rw-r--r--drivers/char/hangcheck-timer.c2
-rw-r--r--drivers/char/sysrq.c14
-rw-r--r--drivers/clocksource/acpi_pm.c20
-rw-r--r--drivers/clocksource/cyclone.c2
-rw-r--r--drivers/clocksource/scx200_hrt.c2
-rw-r--r--drivers/cpufreq/Kconfig2
-rw-r--r--drivers/cpufreq/cpufreq.c258
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c2
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c64
-rw-r--r--drivers/cpufreq/cpufreq_stats.c2
-rw-r--r--drivers/cpufreq/cpufreq_userspace.c2
-rw-r--r--drivers/input/touchscreen/ads7846.c11
-rw-r--r--drivers/isdn/gigaset/Makefile2
-rw-r--r--drivers/video/s3c2410fb.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h1
-rw-r--r--fs/ecryptfs/keystore.c26
-rw-r--r--fs/ecryptfs/main.c5
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/namei.c3
-rw-r--r--fs/nfsd/nfs4acl.c491
-rw-r--r--fs/nfsd/nfs4callback.c7
-rw-r--r--fs/nfsd/nfs4xdr.c55
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--include/acpi/processor.h1
-rw-r--r--include/asm-i386/apic.h9
-rw-r--r--include/asm-i386/hpet.h16
-rw-r--r--include/asm-i386/i8253.h15
-rw-r--r--include/asm-i386/mach-default/do_timer.h78
-rw-r--r--include/asm-i386/mach-voyager/do_timer.h27
-rw-r--r--include/asm-i386/mpspec.h1
-rw-r--r--include/asm-i386/msr.h3
-rw-r--r--include/asm-i386/tsc.h49
-rw-r--r--include/asm-x86_64/hpet.h7
-rw-r--r--include/asm-x86_64/proto.h6
-rw-r--r--include/asm-x86_64/timex.h35
-rw-r--r--include/asm-x86_64/tsc.h66
-rw-r--r--include/asm-x86_64/vsyscall.h29
-rw-r--r--include/linux/acpi_pmtmr.h38
-rw-r--r--include/linux/agp_backend.h5
-rw-r--r--include/linux/clockchips.h142
-rw-r--r--include/linux/clocksource.h39
-rw-r--r--include/linux/cpufreq.h10
-rw-r--r--include/linux/hardirq.h9
-rw-r--r--include/linux/hrtimer.h260
-rw-r--r--include/linux/interrupt.h6
-rw-r--r--include/linux/irq.h52
-rw-r--r--include/linux/jiffies.h222
-rw-r--r--include/linux/ktime.h3
-rw-r--r--include/linux/nfs4.h3
-rw-r--r--include/linux/nfs4_acl.h9
-rw-r--r--include/linux/tick.h109
-rw-r--r--include/linux/time.h1
-rw-r--r--include/linux/timer.h66
-rw-r--r--include/linux/timex.h7
-rw-r--r--init/main.c2
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/hrtimer.c824
-rw-r--r--kernel/irq/chip.c25
-rw-r--r--kernel/irq/manage.c44
-rw-r--r--kernel/irq/proc.c24
-rw-r--r--kernel/itimer.c18
-rw-r--r--kernel/posix-cpu-timers.c15
-rw-r--r--kernel/posix-timers.c15
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/signal.c58
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/time.c254
-rw-r--r--kernel/time/Kconfig25
-rw-r--r--kernel/time/Makefile9
-rw-r--r--kernel/time/clockevents.c345
-rw-r--r--kernel/time/clocksource.c246
-rw-r--r--kernel/time/jiffies.c1
-rw-r--r--kernel/time/ntp.c30
-rw-r--r--kernel/time/tick-broadcast.c480
-rw-r--r--kernel/time/tick-common.c346
-rw-r--r--kernel/time/tick-internal.h110
-rw-r--r--kernel/time/tick-oneshot.c84
-rw-r--r--kernel/time/tick-sched.c563
-rw-r--r--kernel/time/timer_list.c287
-rw-r--r--kernel/time/timer_stats.c411
-rw-r--r--kernel/timer.c286
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/workqueue.c7
-rw-r--r--lib/Kconfig.debug11
-rw-r--r--lib/devres.c8
-rw-r--r--mm/filemap.c32
163 files changed, 9612 insertions, 4547 deletions
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 09dd510c4a5f..576ce463cf44 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -78,7 +78,8 @@ Identifying GPIOs
78----------------- 78-----------------
79GPIOs are identified by unsigned integers in the range 0..MAX_INT. That 79GPIOs are identified by unsigned integers in the range 0..MAX_INT. That
80reserves "negative" numbers for other purposes like marking signals as 80reserves "negative" numbers for other purposes like marking signals as
81"not available on this board", or indicating faults. 81"not available on this board", or indicating faults. Code that doesn't
82touch the underlying hardware treats these integers as opaque cookies.
82 83
83Platforms define how they use those integers, and usually #define symbols 84Platforms define how they use those integers, and usually #define symbols
84for the GPIO lines so that board-specific setup code directly corresponds 85for the GPIO lines so that board-specific setup code directly corresponds
@@ -139,10 +140,10 @@ issues including wire-OR and output latencies.
139The get/set calls have no error returns because "invalid GPIO" should have 140The get/set calls have no error returns because "invalid GPIO" should have
140been reported earlier in gpio_set_direction(). However, note that not all 141been reported earlier in gpio_set_direction(). However, note that not all
141platforms can read the value of output pins; those that can't should always 142platforms can read the value of output pins; those that can't should always
142return zero. Also, these calls will be ignored for GPIOs that can't safely 143return zero. Also, using these calls for GPIOs that can't safely be accessed
143be accessed wihtout sleeping (see below). 144without sleeping (see below) is an error.
144 145
145Platform-specific implementations are encouraged to optimise the two 146Platform-specific implementations are encouraged to optimize the two
146calls to access the GPIO value in cases where the GPIO number (and for 147calls to access the GPIO value in cases where the GPIO number (and for
147output, value) are constant. It's normal for them to need only a couple 148output, value) are constant. It's normal for them to need only a couple
148of instructions in such cases (reading or writing a hardware register), 149of instructions in such cases (reading or writing a hardware register),
@@ -239,7 +240,8 @@ options are part of the IRQ interface, e.g. IRQF_TRIGGER_FALLING, as are
239system wakeup capabilities. 240system wakeup capabilities.
240 241
241Non-error values returned from irq_to_gpio() would most commonly be used 242Non-error values returned from irq_to_gpio() would most commonly be used
242with gpio_get_value(). 243with gpio_get_value(), for example to initialize or update driver state
244when the IRQ is edge-triggered.
243 245
244 246
245 247
@@ -260,9 +262,10 @@ pullups (or pulldowns) so that the on-chip ones should not be used.
260There are other system-specific mechanisms that are not specified here, 262There are other system-specific mechanisms that are not specified here,
261like the aforementioned options for input de-glitching and wire-OR output. 263like the aforementioned options for input de-glitching and wire-OR output.
262Hardware may support reading or writing GPIOs in gangs, but that's usually 264Hardware may support reading or writing GPIOs in gangs, but that's usually
263configuration dependednt: for GPIOs sharing the same bank. (GPIOs are 265configuration dependent: for GPIOs sharing the same bank. (GPIOs are
264commonly grouped in banks of 16 or 32, with a given SOC having several such 266commonly grouped in banks of 16 or 32, with a given SOC having several such
265banks.) Code relying on such mechanisms will necessarily be nonportable. 267banks.) Some systems can trigger IRQs from output GPIOs. Code relying on
268such mechanisms will necessarily be nonportable.
266 269
267Dynamic definition of GPIOs is not currently supported; for example, as 270Dynamic definition of GPIOs is not currently supported; for example, as
268a side effect of configuring an add-on board with some GPIO expanders. 271a side effect of configuring an add-on board with some GPIO expanders.
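The paragraph above on irq_to_gpio() and gpio_get_value() describes updating driver state from an edge-triggered GPIO interrupt. A rough sketch of that pattern (illustrative only, not part of this patch; the my_device structure and the handler name are made up):

#include <linux/interrupt.h>
#include <asm/gpio.h>

struct my_device {
	int pin_level;
};

static irqreturn_t my_gpio_irq(int irq, void *dev_id)
{
	struct my_device *dev = dev_id;
	int gpio = irq_to_gpio(irq);	/* map the IRQ back to its GPIO */

	/* sample the current pin level and update the driver state */
	dev->pin_level = gpio_get_value(gpio);

	return IRQ_HANDLED;
}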
diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/hrtimer/timer_stats.txt
new file mode 100644
index 000000000000..27f782e3593f
--- /dev/null
+++ b/Documentation/hrtimer/timer_stats.txt
@@ -0,0 +1,68 @@
1timer_stats - timer usage statistics
2------------------------------------
3
4timer_stats is a debugging facility to make the timer (ab)usage in a Linux
5system visible to kernel and userspace developers. It is not intended for
6production usage as it adds significant overhead to the (hr)timer code and the
7(hr)timer data structures.
8
9timer_stats should be used by kernel and userspace developers to verify that
10their code does not make undue use of timers. This helps to avoid unnecessary
11wakeups and thus reduces power consumption.
12
13It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration
14section.
15
16timer_stats collects information about the timer events which are fired in a
17Linux system over a sample period:
18
19- the pid of the task (process) which initialized the timer
20- the name of the process which initialized the timer
21- the function where the timer was initialized
22- the callback function which is associated to the timer
23- the number of events (callbacks)
24
25timer_stats adds an entry to /proc: /proc/timer_stats
26
27This entry is used to control the statistics functionality and to read out the
28sampled information.
29
30The timer_stats functionality is inactive on bootup.
31
32To activate a sample period, issue:
33# echo 1 >/proc/timer_stats
34
35To stop a sample period, issue:
36# echo 0 >/proc/timer_stats
37
38The statistics can be retrieved by:
39# cat /proc/timer_stats
40
41The readout of /proc/timer_stats automatically disables sampling. The sampled
42information is kept until a new sample period is started. This allows multiple
43readouts.
44
45Sample output of /proc/timer_stats:
46
47Timerstats sample period: 3.888770 s
48 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
49 15, 1 swapper hcd_submit_urb (rh_timer_func)
50 4, 959 kedac schedule_timeout (process_timeout)
51 1, 0 swapper page_writeback_init (wb_timer_fn)
52 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
53 22, 2948 IRQ 4 tty_flip_buffer_push (delayed_work_timer_fn)
54 3, 3100 bash schedule_timeout (process_timeout)
55 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
56 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
57 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
58 1, 2292 ip __netdev_watchdog_up (dev_watchdog)
59 1, 23 events/1 do_cache_clean (delayed_work_timer_fn)
6090 total events, 30.0 events/sec
61
62The first column is the number of events, the second column the pid, the third
63column is the name of the process. The forth column shows the function which
64initialized the timer and in parantheses the callback function which was
65executed on expiry.
66
67 Thomas, Ingo
68
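The sample-period workflow documented above (write "1" to start sampling, read the file to stop it and fetch the data) can also be driven from a small userspace program. A minimal sketch, assuming nothing beyond the /proc/timer_stats interface described in this file:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char line[256];
	FILE *f;

	/* start a sample period */
	f = fopen("/proc/timer_stats", "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	sleep(5);	/* collect timer events for five seconds */

	/* reading the file stops sampling and returns the statistics */
	f = fopen("/proc/timer_stats", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return 0;
}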
diff --git a/Documentation/hrtimers/highres.txt b/Documentation/hrtimers/highres.txt
new file mode 100644
index 000000000000..ce0e9a91e157
--- /dev/null
+++ b/Documentation/hrtimers/highres.txt
@@ -0,0 +1,249 @@
1High resolution timers and dynamic ticks design notes
2-----------------------------------------------------
3
4Further information can be found in the paper of the OLS 2006 talk "hrtimers
5and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can
6be found on the OLS website:
7http://www.linuxsymposium.org/2006/linuxsymposium_procv1.pdf
8
9The slides to this talk are available from:
10http://tglx.de/projects/hrtimers/ols2006-hrtimers.pdf
11
12The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the
13changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the
14design of the Linux time(r) system before hrtimers and other building blocks
15got merged into mainline.
16
17Note: the paper and the slides talk about "clock event source", while we
18switched to the name "clock event devices" in the meantime.
19
20The design contains the following basic building blocks:
21
22- hrtimer base infrastructure
23- timeofday and clock source management
24- clock event management
25- high resolution timer functionality
26- dynamic ticks
27
28
29hrtimer base infrastructure
30---------------------------
31
32The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of
33the base implementation are covered in Documentation/hrtimers/hrtimers.txt. See
34also figure #2 (OLS slides p. 15).
35
36The main differences from the timer wheel, which holds the armed timer_list type
37timers, are:
38 - time ordered enqueueing into an rb-tree
39 - independent of ticks (the processing is based on nanoseconds)
40
41
42timeofday and clock source management
43-------------------------------------
44
45John Stultz's Generic Time Of Day (GTOD) framework moves a large portion of
46code out of the architecture-specific areas into a generic management
47framework, as illustrated in figure #3 (OLS slides p. 18). The architecture
48specific portion is reduced to the low level hardware details of the clock
49sources, which are registered in the framework and selected on a quality based
50decision. The low level code provides hardware setup and readout routines and
51initializes data structures, which are used by the generic time keeping code to
52convert the clock ticks to nanosecond based time values. All other time keeping
53related functionality is moved into the generic code. The GTOD base patch got
54merged into the 2.6.18 kernel.
55
56Further information about the Generic Time Of Day framework is available in the
57OLS 2005 Proceedings Volume 1:
58http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf
59
60The paper "We Are Not Getting Any Younger: A New Approach to Time and
61Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan.
62
63Figure #3 (OLS slides p.18) illustrates the transformation.
64
65
66clock event management
67----------------------
68
69While clock sources provide read access to the monotonically increasing time
70value, clock event devices are used to schedule the next event
71interrupt(s). The next event is currently defined to be periodic, with its
72period defined at compile time. The setup and selection of the event device
73for various event driven functionalities is hardwired into the architecture
74dependent code. This results in duplicated code across all architectures and
75makes it extremely difficult to change the configuration of the system to use
76event interrupt devices other than those already built into the
77architecture. Another implication of the current design is that it is necessary
78to touch all the architecture-specific implementations in order to provide new
79functionality like high resolution timers or dynamic ticks.
80
81The clock events subsystem tries to address this problem by providing a generic
82solution to manage clock event devices and their usage for the various clock
83event driven kernel functionalities. The goal of the clock event subsystem is
84to minimize the clock event related architecture dependent code to the pure
85hardware related handling and to allow easy addition and utilization of new
86clock event devices. It also minimizes the duplicated code across the
87architectures as it provides generic functionality down to the interrupt
88service handler, which is almost inherently hardware dependent.
89
90Clock event devices are registered either by the architecture dependent boot
91code or at module insertion time. Each clock event device fills a data
92structure with clock-specific property parameters and callback functions. The
93clock event management decides, by using the specified property parameters, the
94set of system functions a clock event device will be used to support. This
95includes the distinction of per-CPU and per-system global event devices.
96
97System-level global event devices are used for the Linux periodic tick. Per-CPU
98event devices are used to provide local CPU functionality such as process
99accounting, profiling, and high resolution timers.
100
101The management layer assigns one or more of the following functions to a clock
102event device:
103 - system global periodic tick (jiffies update)
104 - cpu local update_process_times
105 - cpu local profiling
106 - cpu local next event interrupt (non periodic mode)
107
108The clock event device delegates the selection of those timer interrupt related
109functions completely to the management layer. The clock management layer stores
110a function pointer in the device description structure, which has to be called
111from the hardware level handler. This removes a lot of duplicated code from the
112architecture specific timer interrupt handlers and hands the control over the
113clock event devices and the assignment of timer interrupt related functionality
114to the core code.
115
116The clock event layer API is rather small. Aside from the clock event device
117registration interface, it provides functions to schedule the next event
118interrupt, a clock event device notification service, and support for suspend and
119resume.
120
121The framework adds about 700 lines of code which results in a 2KB increase of
122the kernel binary size. The conversion of i386 removes about 100 lines of
123code. The binary size decrease is in the range of 400 bytes. We believe that the
124increase of flexibility and the avoidance of duplicated code across
125architectures justifies the slight increase of the binary size.
126
127The conversion of an architecture has no functional impact, but allows the
128high resolution and dynamic tick functionalities to be utilized without any change
129to the clock event device and timer interrupt code. After the conversion the
130enabling of high resolution timers and dynamic ticks is simply provided by
131adding the kernel/time/Kconfig file to the architecture specific Kconfig and
132adding the dynamic tick specific calls to the idle routine (a total of 3 lines
133added to the idle function and the Kconfig file).
134
135Figure #4 (OLS slides p.20) illustrates the transformation.
136
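To make the registration flow described in this section concrete, a per-CPU timer driver fills a struct clock_event_device and hands it to clockevents_register_device(). The sketch below is modelled on the lapic_clockevent code added by this patch in arch/i386/kernel/apic.c; the my_timer_* callbacks, the rating and the hardware frequency are hypothetical placeholders, not code from the patch:

#include <linux/clockchips.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/time.h>

static int my_timer_next_event(unsigned long delta,
			       struct clock_event_device *evt)
{
	/* program the hardware to fire 'delta' counter ticks from now */
	return 0;
}

static void my_timer_set_mode(enum clock_event_mode mode,
			      struct clock_event_device *evt)
{
	/* switch the hardware between periodic, oneshot and shutdown */
}

static struct clock_event_device my_clockevent = {
	.name		= "my_timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.shift		= 32,
	.rating		= 200,
	.irq		= -1,
	.set_next_event	= my_timer_next_event,
	.set_mode	= my_timer_set_mode,
};

static void __init my_timer_register(unsigned long hw_freq_hz)
{
	/* scaled multiplier converting nanoseconds to counter ticks */
	my_clockevent.mult = div_sc(hw_freq_hz, NSEC_PER_SEC,
				    my_clockevent.shift);
	my_clockevent.max_delta_ns =
		clockevent_delta2ns(0x7fffffff, &my_clockevent);
	my_clockevent.min_delta_ns = clockevent_delta2ns(0xf, &my_clockevent);
	my_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());

	clockevents_register_device(&my_clockevent);
}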
137
138high resolution timer functionality
139-----------------------------------
140
141During system boot it is not possible to use the high resolution timer
142functionality, while making it possible would be difficult and would serve no
143useful function. The initialization of the clock event device framework, the
144clock source framework (GTOD) and hrtimers itself has to be done and
145appropriate clock sources and clock event devices have to be registered before
146the high resolution functionality can work. Up to the point where hrtimers are
147initialized, the system works in the usual low resolution periodic mode. The
148clock source and the clock event device layers provide notification functions
149which inform hrtimers about availability of new hardware. hrtimers validates
150the usability of the registered clock sources and clock event devices before
151switching to high resolution mode. This also ensures that a kernel which is
152configured for high resolution timers can run on a system which lacks the
153necessary hardware support.
154
155The high resolution timer code does not support SMP machines which have only
156global clock event devices. The support of such hardware would involve IPI
157calls when an interrupt happens. The overhead would be much larger than the
158benefit. This is the reason why we currently disable high resolution and
159dynamic ticks on i386 SMP systems which stop the local APIC in C3 power
160state. A workaround is available as an idea, but the problem has not been
161tackled yet.
162
163The time ordered insertion of timers provides all the infrastructure to decide
164whether the event device has to be reprogrammed when a timer is added. The
165decision is made per timer base and synchronized across per-cpu timer bases in
166a support function. The design allows the system to utilize separate per-CPU
167clock event devices for the per-CPU timer bases, but currently only one
168reprogrammable clock event device per-CPU is utilized.
169
170When the timer interrupt happens, the next event interrupt handler is called
171from the clock event distribution code and moves expired timers from the
172red-black tree to a separate doubly linked list and invokes the softirq
173handler. An additional mode field in the hrtimer structure allows the system to
174execute callback functions directly from the next event interrupt handler. This
175is restricted to code which can safely be executed in the hard interrupt
176context. This applies, for example, to the common case of a wakeup function as
177used by nanosleep. The advantage of executing the handler in the interrupt
178context is the avoidance of up to two context switches - from the interrupted
179context to the softirq and to the task which is woken up by the expired
180timer.
181
182Once a system has switched to high resolution mode, the periodic tick is
183switched off. This disables the per system global periodic clock event device -
184e.g. the PIT on i386 SMP systems.
185
186The periodic tick functionality is provided by a per-cpu hrtimer. The callback
187function is executed in the next event interrupt context and updates jiffies
188and calls update_process_times and profiling. The implementation of the hrtimer
189based periodic tick is designed to be extended with dynamic tick functionality.
190This allows a single clock event device to be used to schedule both high resolution
191timer and periodic events (jiffies tick, profiling, process accounting) on UP
192systems. This has been proven to work with the PIT on i386 and the Incrementer
193on PPC.
194
195The softirq for running the hrtimer queues and executing the callbacks has been
196separated from the tick bound timer softirq to allow accurate delivery of high
197resolution timer signals which are used by itimer and POSIX interval
198timers. The execution of this softirq can still be delayed by other softirqs,
199but the overall latencies have been significantly improved by this separation.
200
201Figure #5 (OLS slides p.22) illustrates the transformation.
202
203
204dynamic ticks
205-------------
206
207Dynamic ticks are the logical consequence of the hrtimer based periodic tick
208replacement (sched_tick). The functionality of the sched_tick hrtimer is
209extended by three functions:
210
211- hrtimer_stop_sched_tick
212- hrtimer_restart_sched_tick
213- hrtimer_update_jiffies
214
215hrtimer_stop_sched_tick() is called when a CPU goes into idle state. The code
216evaluates the next scheduled timer event (from both hrtimers and the timer
217wheel) and, if the next event is further away than the next tick, it
218reprograms the sched_tick to this future event, to allow longer idle sleeps
219without needless interruptions by the periodic tick. The function is also
220called when an interrupt happens during the idle period, which does not cause a
221reschedule. The call is necessary as the interrupt handler might have armed a
222new timer whose expiry time is before the time which was identified as the
223nearest event in the previous call to hrtimer_stop_sched_tick.
224
225hrtimer_restart_sched_tick() is called when the CPU leaves the idle state before
226it calls schedule(). hrtimer_restart_sched_tick() resumes the periodic tick,
227which is kept active until the next call to hrtimer_stop_sched_tick().
228
229hrtimer_update_jiffies() is called from irq_enter() when an interrupt happens
230in the idle period to make sure that jiffies is up to date and the interrupt
231handler does not have to deal with a possibly stale jiffies value.
232
233The dynamic tick feature provides statistical values which are exported to
234userspace via /proc/stats and can be made available for enhanced power
235management control.
236
237The implementation leaves room for further development like full tickless
238systems, where the time slice is controlled by the scheduler, variable
239frequency profiling, and a complete removal of jiffies in the future.
240
241
242Aside from the current initial submission of i386 support, the patchset has
243already been extended to x86_64 and ARM. Initial (work in progress) support is also
244available for MIPS and PowerPC.
245
246 Thomas, Ingo
247
248
249
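As an illustration of the "three lines added to the idle function" mentioned in the clock event management section above, a simplified architecture idle loop with the dynamic tick hooks might look like the sketch below. The hook names follow the description in this document; this is not the actual idle-loop code from the patch:

/* simplified, illustrative idle loop - not the patch's architecture code */
void cpu_idle(void)
{
	for (;;) {
		/* stop the periodic tick if the next timer event is far away */
		hrtimer_stop_sched_tick();

		while (!need_resched())
			default_idle();		/* halt until an interrupt */

		/* resume the periodic tick before going back to the scheduler */
		hrtimer_restart_sched_tick();
		schedule();
	}
}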
diff --git a/Documentation/hrtimers.txt b/Documentation/hrtimers/hrtimers.txt
index ce31f65e12e7..ce31f65e12e7 100644
--- a/Documentation/hrtimers.txt
+++ b/Documentation/hrtimers/hrtimers.txt
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 22b19962a1a2..abd575cfc759 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -609,6 +609,10 @@ and is between 256 and 4096 characters. It is defined in the file
609 highmem otherwise. This also works to reduce highmem 609 highmem otherwise. This also works to reduce highmem
610 size on bigger boxes. 610 size on bigger boxes.
611 611
612 highres= [KNL] Enable/disable high resolution timer mode.
613 Valid parameters: "on", "off"
614 Default: "on"
615
612 hisax= [HW,ISDN] 616 hisax= [HW,ISDN]
613 See Documentation/isdn/README.HiSax. 617 See Documentation/isdn/README.HiSax.
614 618
@@ -1078,6 +1082,10 @@ and is between 256 and 4096 characters. It is defined in the file
1078 in certain environments such as networked servers or 1082 in certain environments such as networked servers or
1079 real-time systems. 1083 real-time systems.
1080 1084
1085 nohz= [KNL] Boottime enable/disable dynamic ticks
1086 Valid arguments: on, off
1087 Default: on
1088
1081 noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing 1089 noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing
1082 1090
1083 noirqdebug [IA-32] Disables the code which attempts to detect and 1091 noirqdebug [IA-32] Disables the code which attempts to detect and
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index ec01f08f5642..e101846ab7dd 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -159,8 +159,7 @@ void __init init_IRQ(void)
159 int irq; 159 int irq;
160 160
161 for (irq = 0; irq < NR_IRQS; irq++) 161 for (irq = 0; irq < NR_IRQS; irq++)
162 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_DELAYED_DISABLE | 162 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
163 IRQ_NOPROBE;
164 163
165#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
166 bad_irq_desc.affinity = CPU_MASK_ALL; 165 bad_irq_desc.affinity = CPU_MASK_ALL;
diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c
index 40039b2a90b3..2703a730baf7 100644
--- a/arch/arm/mach-imx/time.c
+++ b/arch/arm/mach-imx/time.c
@@ -87,7 +87,7 @@ static struct clocksource clocksource_imx = {
87 .read = imx_get_cycles, 87 .read = imx_get_cycles,
88 .mask = 0xFFFFFFFF, 88 .mask = 0xFFFFFFFF,
89 .shift = 20, 89 .shift = 20,
90 .is_continuous = 1, 90 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
91}; 91};
92 92
93static int __init imx_clocksource_init(void) 93static int __init imx_clocksource_init(void)
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 2ec9a9e9a04d..45068c3d8dcc 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -395,7 +395,7 @@ static struct clocksource clocksource_ixp4xx = {
395 .read = ixp4xx_get_cycles, 395 .read = ixp4xx_get_cycles,
396 .mask = CLOCKSOURCE_MASK(32), 396 .mask = CLOCKSOURCE_MASK(32),
397 .shift = 20, 397 .shift = 20,
398 .is_continuous = 1, 398 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
399}; 399};
400 400
401unsigned long ixp4xx_timer_freq = FREQ; 401unsigned long ixp4xx_timer_freq = FREQ;
diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c
index 5773b55ef4a6..7e132fcccd47 100644
--- a/arch/arm/mach-netx/time.c
+++ b/arch/arm/mach-netx/time.c
@@ -62,7 +62,7 @@ static struct clocksource clocksource_netx = {
62 .read = netx_get_cycles, 62 .read = netx_get_cycles,
63 .mask = CLOCKSOURCE_MASK(32), 63 .mask = CLOCKSOURCE_MASK(32),
64 .shift = 20, 64 .shift = 20,
65 .is_continuous = 1, 65 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
66}; 66};
67 67
68/* 68/*
diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c
index ee2beb400414..fc3b82a740a0 100644
--- a/arch/arm/mach-pxa/time.c
+++ b/arch/arm/mach-pxa/time.c
@@ -112,7 +112,7 @@ static struct clocksource clocksource_pxa = {
112 .read = pxa_get_cycles, 112 .read = pxa_get_cycles,
113 .mask = CLOCKSOURCE_MASK(32), 113 .mask = CLOCKSOURCE_MASK(32),
114 .shift = 20, 114 .shift = 20,
115 .is_continuous = 1, 115 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
116}; 116};
117 117
118static void __init pxa_timer_init(void) 118static void __init pxa_timer_init(void)
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index a2f74affaa98..c10833f2ee0c 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -37,7 +37,7 @@ static struct clocksource clocksource_avr32 = {
37 .read = read_cycle_count, 37 .read = read_cycle_count,
38 .mask = CLOCKSOURCE_MASK(32), 38 .mask = CLOCKSOURCE_MASK(32),
39 .shift = 16, 39 .shift = 16,
40 .is_continuous = 1, 40 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
41}; 41};
42 42
43/* 43/*
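The hunks above all replace the old .is_continuous member with the new .flags field of struct clocksource. For reference, a complete clocksource definition in the new style looks roughly like the sketch below; the my_timer_read() helper, the rating and the 1 MHz counter frequency are hypothetical placeholders:

#include <linux/clocksource.h>
#include <linux/init.h>

static cycle_t my_timer_read(void)
{
	/* return the current value of a free-running hardware counter */
	return 0;
}

static struct clocksource clocksource_my_timer = {
	.name		= "my_timer",
	.rating		= 200,
	.read		= my_timer_read,
	.mask		= CLOCKSOURCE_MASK(32),
	.shift		= 20,
	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init my_timer_clocksource_init(void)
{
	/* mult scales counter ticks to nanoseconds (shifted by 2^shift) */
	clocksource_my_timer.mult =
		clocksource_hz2mult(1000000, clocksource_my_timer.shift);

	return clocksource_register(&clocksource_my_timer);
}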
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 595fb771366e..1df4a1f14289 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -18,6 +18,18 @@ config GENERIC_TIME
18 bool 18 bool
19 default y 19 default y
20 20
21config CLOCKSOURCE_WATCHDOG
22 bool
23 default y
24
25config GENERIC_CLOCKEVENTS
26 bool
27 default y
28
29config GENERIC_CLOCKEVENTS_BROADCAST
30 bool
31 default y
32
21config LOCKDEP_SUPPORT 33config LOCKDEP_SUPPORT
22 bool 34 bool
23 default y 35 default y
@@ -74,6 +86,8 @@ source "init/Kconfig"
74 86
75menu "Processor type and features" 87menu "Processor type and features"
76 88
89source "kernel/time/Kconfig"
90
77config SMP 91config SMP
78 bool "Symmetric multi-processing support" 92 bool "Symmetric multi-processing support"
79 ---help--- 93 ---help---
@@ -205,7 +219,7 @@ config PARAVIRT
205 219
206config VMI 220config VMI
207 bool "VMI Paravirt-ops support" 221 bool "VMI Paravirt-ops support"
208 depends on PARAVIRT 222 depends on PARAVIRT && !NO_HZ
209 default y 223 default y
210 help 224 help
211 VMI provides a paravirtualized interface to multiple hypervisors 225 VMI provides a paravirtualized interface to multiple hypervisors
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index cbe4e601885c..4ae3dcf1d2f0 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o 18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o 19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm.o 20obj-$(CONFIG_APM) += apm.o
21obj-$(CONFIG_X86_SMP) += smp.o smpboot.o 21obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o
22obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 22obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
23obj-$(CONFIG_X86_MPPARSE) += mpparse.o 23obj-$(CONFIG_X86_MPPARSE) += mpparse.o
24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
@@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
32obj-$(CONFIG_MODULES) += module.o 32obj-$(CONFIG_MODULES) += module.o
33obj-y += sysenter.o vsyscall.o 33obj-y += sysenter.o vsyscall.o
34obj-$(CONFIG_ACPI_SRAT) += srat.o 34obj-$(CONFIG_ACPI_SRAT) += srat.o
35obj-$(CONFIG_HPET_TIMER) += time_hpet.o
36obj-$(CONFIG_EFI) += efi.o efi_stub.o 35obj-$(CONFIG_EFI) += efi.o efi_stub.o
37obj-$(CONFIG_DOUBLEFAULT) += doublefault.o 36obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
38obj-$(CONFIG_VM86) += vm86.o 37obj-$(CONFIG_VM86) += vm86.o
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index e94aff6888ca..fb3e72328a5a 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/acpi.h> 27#include <linux/acpi.h>
28#include <linux/acpi_pmtmr.h>
28#include <linux/efi.h> 29#include <linux/efi.h>
29#include <linux/cpumask.h> 30#include <linux/cpumask.h>
30#include <linux/module.h> 31#include <linux/module.h>
@@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
615} 616}
616 617
617#ifdef CONFIG_HPET_TIMER 618#ifdef CONFIG_HPET_TIMER
619#include <asm/hpet.h>
618 620
619static int __init acpi_parse_hpet(struct acpi_table_header *table) 621static int __init acpi_parse_hpet(struct acpi_table_header *table)
620{ 622{
@@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
645 hpet_res->end = (1 * 1024) - 1; 647 hpet_res->end = (1 * 1024) - 1;
646 } 648 }
647 649
648#ifdef CONFIG_X86_64 650 hpet_address = hpet_tbl->address.address;
649 vxtime.hpet_address = hpet_tbl->address.address;
650
651 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", 651 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
652 hpet_tbl->id, vxtime.hpet_address); 652 hpet_tbl->id, hpet_address);
653
654 res_start = vxtime.hpet_address;
655#else /* X86 */
656 {
657 extern unsigned long hpet_address;
658
659 hpet_address = hpet_tbl->address.address;
660 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
661 hpet_tbl->id, hpet_address);
662 653
663 res_start = hpet_address; 654 res_start = hpet_address;
664 }
665#endif /* X86 */
666 655
667 if (hpet_res) { 656 if (hpet_res) {
668 hpet_res->start = res_start; 657 hpet_res->start = res_start;
@@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
676#define acpi_parse_hpet NULL 665#define acpi_parse_hpet NULL
677#endif 666#endif
678 667
679#ifdef CONFIG_X86_PM_TIMER
680extern u32 pmtmr_ioport;
681#endif
682
683static int __init acpi_parse_fadt(struct acpi_table_header *table) 668static int __init acpi_parse_fadt(struct acpi_table_header *table)
684{ 669{
685 670
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index f4159e0a7ae9..9655c233e6f1 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -25,6 +25,8 @@
25#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/cpu.h> 27#include <linux/cpu.h>
28#include <linux/clockchips.h>
29#include <linux/acpi_pmtmr.h>
28#include <linux/module.h> 30#include <linux/module.h>
29 31
30#include <asm/atomic.h> 32#include <asm/atomic.h>
@@ -45,128 +47,549 @@
45#include "io_ports.h" 47#include "io_ports.h"
46 48
47/* 49/*
48 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as 50 * Sanity check
49 * IPIs in place of local APIC timers
50 */ 51 */
51static cpumask_t timer_bcast_ipi; 52#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
53# error SPURIOUS_APIC_VECTOR definition error
54#endif
52 55
53/* 56/*
54 * Knob to control our willingness to enable the local APIC. 57 * Knob to control our willingness to enable the local APIC.
58 *
59 * -1=force-disable, +1=force-enable
55 */ 60 */
56static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ 61static int enable_local_apic __initdata = 0;
57
58static inline void lapic_disable(void)
59{
60 enable_local_apic = -1;
61 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
62}
63 62
64static inline void lapic_enable(void) 63/* Local APIC timer verification ok */
65{ 64static int local_apic_timer_verify_ok;
66 enable_local_apic = 1;
67}
68 65
69/* 66/*
70 * Debug level 67 * Debug level, exported for io_apic.c
71 */ 68 */
72int apic_verbosity; 69int apic_verbosity;
73 70
71static unsigned int calibration_result;
74 72
73static int lapic_next_event(unsigned long delta,
74 struct clock_event_device *evt);
75static void lapic_timer_setup(enum clock_event_mode mode,
76 struct clock_event_device *evt);
77static void lapic_timer_broadcast(cpumask_t mask);
75static void apic_pm_activate(void); 78static void apic_pm_activate(void);
76 79
80/*
81 * The local apic timer can be used for any function which is CPU local.
82 */
83static struct clock_event_device lapic_clockevent = {
84 .name = "lapic",
85 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
86 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
87 .shift = 32,
88 .set_mode = lapic_timer_setup,
89 .set_next_event = lapic_next_event,
90 .broadcast = lapic_timer_broadcast,
91 .rating = 100,
92 .irq = -1,
93};
94static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
95
96/* Local APIC was disabled by the BIOS and enabled by the kernel */
97static int enabled_via_apicbase;
98
99/*
100 * Get the LAPIC version
101 */
102static inline int lapic_get_version(void)
103{
104 return GET_APIC_VERSION(apic_read(APIC_LVR));
105}
106
107/*
108 * Check, if the APIC is integrated or a separate chip
109 */
110static inline int lapic_is_integrated(void)
111{
112 return APIC_INTEGRATED(lapic_get_version());
113}
114
115/*
116 * Check, whether this is a modern or a first generation APIC
117 */
77static int modern_apic(void) 118static int modern_apic(void)
78{ 119{
79 unsigned int lvr, version;
80 /* AMD systems use old APIC versions, so check the CPU */ 120 /* AMD systems use old APIC versions, so check the CPU */
81 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 121 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
82 boot_cpu_data.x86 >= 0xf) 122 boot_cpu_data.x86 >= 0xf)
83 return 1; 123 return 1;
84 lvr = apic_read(APIC_LVR); 124 return lapic_get_version() >= 0x14;
85 version = GET_APIC_VERSION(lvr); 125}
86 return version >= 0x14; 126
127/**
128 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
129 */
130void enable_NMI_through_LVT0 (void * dummy)
131{
132 unsigned int v = APIC_DM_NMI;
133
134 /* Level triggered for 82489DX */
135 if (!lapic_is_integrated())
136 v |= APIC_LVT_LEVEL_TRIGGER;
137 apic_write_around(APIC_LVT0, v);
138}
139
140/**
141 * get_physical_broadcast - Get number of physical broadcast IDs
142 */
143int get_physical_broadcast(void)
144{
145 return modern_apic() ? 0xff : 0xf;
146}
147
148/**
149 * lapic_get_maxlvt - get the maximum number of local vector table entries
150 */
151int lapic_get_maxlvt(void)
152{
153 unsigned int v = apic_read(APIC_LVR);
154
155 /* 82489DXs do not report # of LVT entries. */
156 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
87} 157}
88 158
89/* 159/*
90 * 'what should we do if we get a hw irq event on an illegal vector'. 160 * Local APIC timer
91 * each architecture has to answer this themselves.
92 */ 161 */
93void ack_bad_irq(unsigned int irq) 162
163/* Clock divisor is set to 16 */
164#define APIC_DIVISOR 16
165
166/*
167 * This function sets up the local APIC timer, with a timeout of
168 * 'clocks' APIC bus clock. During calibration we actually call
169 * this function twice on the boot CPU, once with a bogus timeout
170 * value, second time for real. The other (noncalibrating) CPUs
171 * call this function only once, with the real, calibrated value.
172 *
173 * We do reads before writes even if unnecessary, to get around the
174 * P5 APIC double write bug.
175 */
176static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
94{ 177{
95 printk("unexpected IRQ trap at vector %02x\n", irq); 178 unsigned int lvtt_value, tmp_value;
179
180 lvtt_value = LOCAL_TIMER_VECTOR;
181 if (!oneshot)
182 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
183 if (!lapic_is_integrated())
184 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
185
186 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED;
188
189 apic_write_around(APIC_LVTT, lvtt_value);
190
96 /* 191 /*
97 * Currently unexpected vectors happen only on SMP and APIC. 192 * Divide PICLK by 16
98 * We _must_ ack these because every local APIC has only N
99 * irq slots per priority level, and a 'hanging, unacked' IRQ
100 * holds up an irq slot - in excessive cases (when multiple
101 * unexpected vectors occur) that might lock up the APIC
102 * completely.
103 * But only ack when the APIC is enabled -AK
104 */ 193 */
105 if (cpu_has_apic) 194 tmp_value = apic_read(APIC_TDCR);
106 ack_APIC_irq(); 195 apic_write_around(APIC_TDCR, (tmp_value
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
197 | APIC_TDR_DIV_16);
198
199 if (!oneshot)
200 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
107} 201}
108 202
109void __init apic_intr_init(void) 203/*
204 * Program the next event, relative to now
205 */
206static int lapic_next_event(unsigned long delta,
207 struct clock_event_device *evt)
208{
209 apic_write_around(APIC_TMICT, delta);
210 return 0;
211}
212
213/*
214 * Setup the lapic timer in periodic or oneshot mode
215 */
216static void lapic_timer_setup(enum clock_event_mode mode,
217 struct clock_event_device *evt)
218{
219 unsigned long flags;
220 unsigned int v;
221
222 /* Lapic used for broadcast ? */
223 if (!local_apic_timer_verify_ok)
224 return;
225
226 local_irq_save(flags);
227
228 switch (mode) {
229 case CLOCK_EVT_MODE_PERIODIC:
230 case CLOCK_EVT_MODE_ONESHOT:
231 __setup_APIC_LVTT(calibration_result,
232 mode != CLOCK_EVT_MODE_PERIODIC, 1);
233 break;
234 case CLOCK_EVT_MODE_UNUSED:
235 case CLOCK_EVT_MODE_SHUTDOWN:
236 v = apic_read(APIC_LVTT);
237 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
238 apic_write_around(APIC_LVTT, v);
239 break;
240 }
241
242 local_irq_restore(flags);
243}
244
245/*
246 * Local APIC timer broadcast function
247 */
248static void lapic_timer_broadcast(cpumask_t mask)
110{ 249{
111#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
112 smp_intr_init(); 251 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
113#endif 252#endif
114 /* self generated IPI for local APIC timer */ 253}
115 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
116 254
117 /* IPI vectors for APIC spurious and error interrupts */ 255/*
118 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 256 * Setup the local APIC timer for this CPU. Copy the initialized values
119 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 257 * of the boot CPU and register the clock event in the framework.
258 */
259static void __devinit setup_APIC_timer(void)
260{
261 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
120 262
121 /* thermal monitor LVT interrupt */ 263 memcpy(levt, &lapic_clockevent, sizeof(*levt));
122#ifdef CONFIG_X86_MCE_P4THERMAL 264 levt->cpumask = cpumask_of_cpu(smp_processor_id());
123 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 265
124#endif 266 clockevents_register_device(levt);
125} 267}
126 268
127/* Using APIC to generate smp_local_timer_interrupt? */ 269/*
128int using_apic_timer __read_mostly = 0; 270 * In this functions we calibrate APIC bus clocks to the external timer.
271 *
272 * We want to do the calibration only once since we want to have local timer
273 * irqs syncron. CPUs connected by the same APIC bus have the very same bus
274 * frequency.
275 *
276 * This was previously done by reading the PIT/HPET and waiting for a wrap
277 * around to find out, that a tick has elapsed. I have a box, where the PIT
278 * readout is broken, so it never gets out of the wait loop again. This was
279 * also reported by others.
280 *
281 * Monitoring the jiffies value is inaccurate and the clockevents
282 * infrastructure allows us to do a simple substitution of the interrupt
283 * handler.
284 *
285 * The calibration routine also uses the pm_timer when possible, as the PIT
286 * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
287 * back to normal later in the boot process).
288 */
129 289
130static int enabled_via_apicbase; 290#define LAPIC_CAL_LOOPS (HZ/10)
131 291
132void enable_NMI_through_LVT0 (void * dummy) 292static __initdata volatile int lapic_cal_loops = -1;
293static __initdata long lapic_cal_t1, lapic_cal_t2;
294static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
295static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
296static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
297
298/*
299 * Temporary interrupt handler.
300 */
301static void __init lapic_cal_handler(struct clock_event_device *dev)
133{ 302{
134 unsigned int v, ver; 303 unsigned long long tsc = 0;
304 long tapic = apic_read(APIC_TMCCT);
305 unsigned long pm = acpi_pm_read_early();
135 306
136 ver = apic_read(APIC_LVR); 307 if (cpu_has_tsc)
137 ver = GET_APIC_VERSION(ver); 308 rdtscll(tsc);
138 v = APIC_DM_NMI; /* unmask and set to NMI */ 309
139 if (!APIC_INTEGRATED(ver)) /* 82489DX */ 310 switch (lapic_cal_loops++) {
140 v |= APIC_LVT_LEVEL_TRIGGER; 311 case 0:
141 apic_write_around(APIC_LVT0, v); 312 lapic_cal_t1 = tapic;
313 lapic_cal_tsc1 = tsc;
314 lapic_cal_pm1 = pm;
315 lapic_cal_j1 = jiffies;
316 break;
317
318 case LAPIC_CAL_LOOPS:
319 lapic_cal_t2 = tapic;
320 lapic_cal_tsc2 = tsc;
321 if (pm < lapic_cal_pm1)
322 pm += ACPI_PM_OVRRUN;
323 lapic_cal_pm2 = pm;
324 lapic_cal_j2 = jiffies;
325 break;
326 }
142} 327}
143 328
144int get_physical_broadcast(void) 329/*
330 * Setup the boot APIC
331 *
332 * Calibrate and verify the result.
333 */
334void __init setup_boot_APIC_clock(void)
145{ 335{
146 if (modern_apic()) 336 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
147 return 0xff; 337 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
148 else 338 const long pm_thresh = pm_100ms/100;
149 return 0xf; 339 void (*real_handler)(struct clock_event_device *dev);
340 unsigned long deltaj;
341 long delta, deltapm;
342
343 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
344 "calibrating APIC timer ...\n");
345
346 local_irq_disable();
347
348 /* Replace the global interrupt handler */
349 real_handler = global_clock_event->event_handler;
350 global_clock_event->event_handler = lapic_cal_handler;
351
352 /*
353 * Setup the APIC counter to 1e9. There is no way the lapic
354 * can underflow in the 100ms detection time frame
355 */
356 __setup_APIC_LVTT(1000000000, 0, 0);
357
358 /* Let the interrupts run */
359 local_irq_enable();
360
361 while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
362
363 local_irq_disable();
364
365 /* Restore the real event handler */
366 global_clock_event->event_handler = real_handler;
367
368 /* Build delta t1-t2 as apic timer counts down */
369 delta = lapic_cal_t1 - lapic_cal_t2;
370 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
371
372 /* Check, if the PM timer is available */
373 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
374 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
375
376 if (deltapm) {
377 unsigned long mult;
378 u64 res;
379
380 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
381
382 if (deltapm > (pm_100ms - pm_thresh) &&
383 deltapm < (pm_100ms + pm_thresh)) {
384 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
385 } else {
386 res = (((u64) deltapm) * mult) >> 22;
387 do_div(res, 1000000);
388 printk(KERN_WARNING "APIC calibration not consistent "
389 "with PM Timer: %ldms instead of 100ms\n",
390 (long)res);
391 /* Correct the lapic counter value */
392 res = (((u64) delta ) * pm_100ms);
393 do_div(res, deltapm);
394 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
395 "%lu (%ld)\n", (unsigned long) res, delta);
396 delta = (long) res;
397 }
398 }
399
400 /* Calculate the scaled math multiplication factor */
401 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
402 lapic_clockevent.max_delta_ns =
403 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
404 lapic_clockevent.min_delta_ns =
405 clockevent_delta2ns(0xF, &lapic_clockevent);
406
407 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
408
409 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
410 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
411 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
412 calibration_result);
413
414 if (cpu_has_tsc) {
415 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
416 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
417 "%ld.%04ld MHz.\n",
418 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
419 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
420 }
421
422 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
423 "%u.%04u MHz.\n",
424 calibration_result / (1000000 / HZ),
425 calibration_result % (1000000 / HZ));
426
427
428 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
429
430 /*
431 * Setup the apic timer manually
432 */
433 local_apic_timer_verify_ok = 1;
434 levt->event_handler = lapic_cal_handler;
435 lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
436 lapic_cal_loops = -1;
437
438 /* Let the interrupts run */
439 local_irq_enable();
440
441 while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
442
443 local_irq_disable();
444
445 /* Stop the lapic timer */
446 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
447
448 local_irq_enable();
449
450 /* Jiffies delta */
451 deltaj = lapic_cal_j2 - lapic_cal_j1;
452 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
453
454 /* Check, if the PM timer is available */
455 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
456 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
457
458 local_apic_timer_verify_ok = 0;
459
460 if (deltapm) {
461 if (deltapm > (pm_100ms - pm_thresh) &&
462 deltapm < (pm_100ms + pm_thresh)) {
463 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
464 /* Check, if the jiffies result is consistent */
465 if (deltaj < LAPIC_CAL_LOOPS-2 ||
466 deltaj > LAPIC_CAL_LOOPS+2) {
467 /*
468 * Not sure, what we can do about this one.
469 * When high resolution timers are active
470 * and the lapic timer does not stop in C3
471 * we are fine. Otherwise more trouble might
472 * be waiting. -- tglx
473 */
474 printk(KERN_WARNING "Global event device %s "
475 "has wrong frequency "
476 "(%lu ticks instead of %d)\n",
477 global_clock_event->name, deltaj,
478 LAPIC_CAL_LOOPS);
479 }
480 local_apic_timer_verify_ok = 1;
481 }
482 } else {
483 /* Check, if the jiffies result is consistent */
484 if (deltaj >= LAPIC_CAL_LOOPS-2 &&
485 deltaj <= LAPIC_CAL_LOOPS+2) {
486 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
487 local_apic_timer_verify_ok = 1;
488 }
489 }
490
491 if (!local_apic_timer_verify_ok) {
492 printk(KERN_WARNING
493 "APIC timer disabled due to verification failure.\n");
494 /* No broadcast on UP ! */
495 if (num_possible_cpus() == 1)
496 return;
497 } else
498 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
499
500 /* Setup the lapic or request the broadcast */
501 setup_APIC_timer();
502}
503
504void __devinit setup_secondary_APIC_clock(void)
505{
506 setup_APIC_timer();
150} 507}
151 508
152int get_maxlvt(void) 509/*
510 * The guts of the apic timer interrupt
511 */
512static void local_apic_timer_interrupt(void)
153{ 513{
154 unsigned int v, ver, maxlvt; 514 int cpu = smp_processor_id();
515 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
155 516
156 v = apic_read(APIC_LVR); 517 /*
157 ver = GET_APIC_VERSION(v); 518 * Normally we should not be here till LAPIC has been initialized but
158 /* 82489DXs do not report # of LVT entries. */ 519 * in some cases like kdump, its possible that there is a pending LAPIC
159 maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; 520 * timer interrupt from previous kernel's context and is delivered in
160 return maxlvt; 521 * new kernel the moment interrupts are enabled.
522 *
523 * Interrupts are enabled early and LAPIC is setup much later, hence
524 * its possible that when we get here evt->event_handler is NULL.
525 * Check for event_handler being NULL and discard the interrupt as
526 * spurious.
527 */
528 if (!evt->event_handler) {
529 printk(KERN_WARNING
530 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
531 /* Switch it off */
532 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
533 return;
534 }
535
536 per_cpu(irq_stat, cpu).apic_timer_irqs++;
537
538 evt->event_handler(evt);
539}
540
541/*
542 * Local APIC timer interrupt. This is the most natural way for doing
543 * local interrupts, but local timer interrupts can be emulated by
544 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
545 *
546 * [ if a single-CPU system runs an SMP kernel then we call the local
547 * interrupt as well. Thus we cannot inline the local irq ... ]
548 */
549
550void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
551{
552 struct pt_regs *old_regs = set_irq_regs(regs);
553
554 /*
555 * NOTE! We'd better ACK the irq immediately,
556 * because timer handling can be slow.
557 */
558 ack_APIC_irq();
559 /*
560 * update_process_times() expects us to have done irq_enter().
561 * Besides, if we don't, timer interrupts ignore the global
562 * interrupt lock, which is the WrongThing (tm) to do.
563 */
564 exit_idle();
565 irq_enter();
566 local_apic_timer_interrupt();
567 irq_exit();
568
569 set_irq_regs(old_regs);
161} 570}
162 571
572int setup_profiling_timer(unsigned int multiplier)
573{
574 return -EINVAL;
575}
576
577/*
578 * Local APIC start and shutdown
579 */
580
581/**
582 * clear_local_APIC - shutdown the local APIC
583 *
584 * This is called, when a CPU is disabled and before rebooting, so the state of
585 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
586 * leftovers during boot.
587 */
163void clear_local_APIC(void) 588void clear_local_APIC(void)
164{ 589{
165 int maxlvt; 590 int maxlvt = lapic_get_maxlvt();
166 unsigned long v; 591 unsigned long v;
167 592
168 maxlvt = get_maxlvt();
169
170 /* 593 /*
171 * Masking an LVT entry can trigger a local APIC error 594 * Masking an LVT entry can trigger a local APIC error
172 * if the vector is zero. Mask LVTERR first to prevent this. 595 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -190,7 +613,7 @@ void clear_local_APIC(void)
190 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 613 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
191 } 614 }
192 615
193/* lets not touch this if we didn't frob it */ 616 /* lets not touch this if we didn't frob it */
194#ifdef CONFIG_X86_MCE_P4THERMAL 617#ifdef CONFIG_X86_MCE_P4THERMAL
195 if (maxlvt >= 5) { 618 if (maxlvt >= 5) {
196 v = apic_read(APIC_LVTTHMR); 619 v = apic_read(APIC_LVTTHMR);
@@ -212,85 +635,18 @@ void clear_local_APIC(void)
212 if (maxlvt >= 5) 635 if (maxlvt >= 5)
213 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 636 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
214#endif 637#endif
215 v = GET_APIC_VERSION(apic_read(APIC_LVR)); 638 /* Integrated APIC (!82489DX) ? */
216 if (APIC_INTEGRATED(v)) { /* !82489DX */ 639 if (lapic_is_integrated()) {
217 if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ 640 if (maxlvt > 3)
641 /* Clear ESR due to Pentium errata 3AP and 11AP */
218 apic_write(APIC_ESR, 0); 642 apic_write(APIC_ESR, 0);
219 apic_read(APIC_ESR); 643 apic_read(APIC_ESR);
220 } 644 }
221} 645}
222 646
223void __init connect_bsp_APIC(void) 647/**
224{ 648 * disable_local_APIC - clear and disable the local APIC
225 if (pic_mode) { 649 */
226 /*
227 * Do not trust the local APIC being empty at bootup.
228 */
229 clear_local_APIC();
230 /*
231 * PIC mode, enable APIC mode in the IMCR, i.e.
232 * connect BSP's local APIC to INT and NMI lines.
233 */
234 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
235 "enabling APIC mode.\n");
236 outb(0x70, 0x22);
237 outb(0x01, 0x23);
238 }
239 enable_apic_mode();
240}
241
242void disconnect_bsp_APIC(int virt_wire_setup)
243{
244 if (pic_mode) {
245 /*
246 * Put the board back into PIC mode (has an effect
247 * only on certain older boards). Note that APIC
248 * interrupts, including IPIs, won't work beyond
249 * this point! The only exception are INIT IPIs.
250 */
251 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
252 "entering PIC mode.\n");
253 outb(0x70, 0x22);
254 outb(0x00, 0x23);
255 }
256 else {
257 /* Go back to Virtual Wire compatibility mode */
258 unsigned long value;
259
260 /* For the spurious interrupt use vector F, and enable it */
261 value = apic_read(APIC_SPIV);
262 value &= ~APIC_VECTOR_MASK;
263 value |= APIC_SPIV_APIC_ENABLED;
264 value |= 0xf;
265 apic_write_around(APIC_SPIV, value);
266
267 if (!virt_wire_setup) {
268 /* For LVT0 make it edge triggered, active high, external and enabled */
269 value = apic_read(APIC_LVT0);
270 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
271 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
272 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
273 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
274 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
275 apic_write_around(APIC_LVT0, value);
276 }
277 else {
278 /* Disable LVT0 */
279 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
280 }
281
282 /* For LVT1 make it edge triggered, active high, nmi and enabled */
283 value = apic_read(APIC_LVT1);
284 value &= ~(
285 APIC_MODE_MASK | APIC_SEND_PENDING |
286 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
287 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
288 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
289 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
290 apic_write_around(APIC_LVT1, value);
291 }
292}
293
294void disable_local_APIC(void) 650void disable_local_APIC(void)
295{ 651{
296 unsigned long value; 652 unsigned long value;
@@ -305,8 +661,13 @@ void disable_local_APIC(void)
305 value &= ~APIC_SPIV_APIC_ENABLED; 661 value &= ~APIC_SPIV_APIC_ENABLED;
306 apic_write_around(APIC_SPIV, value); 662 apic_write_around(APIC_SPIV, value);
307 663
664 /*
665 * When LAPIC was disabled by the BIOS and enabled by the kernel,
666 * restore the disabled state.
667 */
308 if (enabled_via_apicbase) { 668 if (enabled_via_apicbase) {
309 unsigned int l, h; 669 unsigned int l, h;
670
310 rdmsr(MSR_IA32_APICBASE, l, h); 671 rdmsr(MSR_IA32_APICBASE, l, h);
311 l &= ~MSR_IA32_APICBASE_ENABLE; 672 l &= ~MSR_IA32_APICBASE_ENABLE;
312 wrmsr(MSR_IA32_APICBASE, l, h); 673 wrmsr(MSR_IA32_APICBASE, l, h);
@@ -314,6 +675,28 @@ void disable_local_APIC(void)
314} 675}
315 676
316/* 677/*
678 * If Linux enabled the LAPIC against the BIOS default, disable it again before
679 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
680 * fail to power off. Additionally, clear all LVT entries before disable_local_APIC
681 * for the case where Linux didn't enable the LAPIC.
682 */
683void lapic_shutdown(void)
684{
685 unsigned long flags;
686
687 if (!cpu_has_apic)
688 return;
689
690 local_irq_save(flags);
691 clear_local_APIC();
692
693 if (enabled_via_apicbase)
694 disable_local_APIC();
695
696 local_irq_restore(flags);
697}
698
699/*
317 * This is to verify that we're looking at a real local APIC. 700 * This is to verify that we're looking at a real local APIC.
318 * Check these against your board if the CPUs aren't getting 701 * Check these against your board if the CPUs aren't getting
319 * started for no apparent reason. 702 * started for no apparent reason.
@@ -345,7 +728,7 @@ int __init verify_local_APIC(void)
345 reg1 = GET_APIC_VERSION(reg0); 728 reg1 = GET_APIC_VERSION(reg0);
346 if (reg1 == 0x00 || reg1 == 0xff) 729 if (reg1 == 0x00 || reg1 == 0xff)
347 return 0; 730 return 0;
348 reg1 = get_maxlvt(); 731 reg1 = lapic_get_maxlvt();
349 if (reg1 < 0x02 || reg1 == 0xff) 732 if (reg1 < 0x02 || reg1 == 0xff)
350 return 0; 733 return 0;
351 734
@@ -368,10 +751,15 @@ int __init verify_local_APIC(void)
368 return 1; 751 return 1;
369} 752}
370 753
754/**
755 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
756 */
371void __init sync_Arb_IDs(void) 757void __init sync_Arb_IDs(void)
372{ 758{
373 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 759 /*
374 And not needed on AMD */ 760 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1, and not
761 * needed on AMD.
762 */
375 if (modern_apic()) 763 if (modern_apic())
376 return; 764 return;
377 /* 765 /*
@@ -384,14 +772,12 @@ void __init sync_Arb_IDs(void)
384 | APIC_DM_INIT); 772 | APIC_DM_INIT);
385} 773}
386 774
387extern void __error_in_apic_c (void);
388
389/* 775/*
390 * An initial setup of the virtual wire mode. 776 * An initial setup of the virtual wire mode.
391 */ 777 */
392void __init init_bsp_APIC(void) 778void __init init_bsp_APIC(void)
393{ 779{
394 unsigned long value, ver; 780 unsigned long value;
395 781
396 /* 782 /*
397 * Don't do the setup now if we have a SMP BIOS as the 783 * Don't do the setup now if we have a SMP BIOS as the
@@ -400,9 +786,6 @@ void __init init_bsp_APIC(void)
400 if (smp_found_config || !cpu_has_apic) 786 if (smp_found_config || !cpu_has_apic)
401 return; 787 return;
402 788
403 value = apic_read(APIC_LVR);
404 ver = GET_APIC_VERSION(value);
405
406 /* 789 /*
407 * Do not trust the local APIC being empty at bootup. 790 * Do not trust the local APIC being empty at bootup.
408 */ 791 */
@@ -414,9 +797,10 @@ void __init init_bsp_APIC(void)
414 value = apic_read(APIC_SPIV); 797 value = apic_read(APIC_SPIV);
415 value &= ~APIC_VECTOR_MASK; 798 value &= ~APIC_VECTOR_MASK;
416 value |= APIC_SPIV_APIC_ENABLED; 799 value |= APIC_SPIV_APIC_ENABLED;
417 800
418 /* This bit is reserved on P4/Xeon and should be cleared */ 801 /* This bit is reserved on P4/Xeon and should be cleared */
419 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) 802 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
803 (boot_cpu_data.x86 == 15))
420 value &= ~APIC_SPIV_FOCUS_DISABLED; 804 value &= ~APIC_SPIV_FOCUS_DISABLED;
421 else 805 else
422 value |= APIC_SPIV_FOCUS_DISABLED; 806 value |= APIC_SPIV_FOCUS_DISABLED;
@@ -428,14 +812,17 @@ void __init init_bsp_APIC(void)
428 */ 812 */
429 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 813 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
430 value = APIC_DM_NMI; 814 value = APIC_DM_NMI;
431 if (!APIC_INTEGRATED(ver)) /* 82489DX */ 815 if (!lapic_is_integrated()) /* 82489DX */
432 value |= APIC_LVT_LEVEL_TRIGGER; 816 value |= APIC_LVT_LEVEL_TRIGGER;
433 apic_write_around(APIC_LVT1, value); 817 apic_write_around(APIC_LVT1, value);
434} 818}
435 819
820/**
821 * setup_local_APIC - setup the local APIC
822 */
436void __devinit setup_local_APIC(void) 823void __devinit setup_local_APIC(void)
437{ 824{
438 unsigned long oldvalue, value, ver, maxlvt; 825 unsigned long oldvalue, value, maxlvt, integrated;
439 int i, j; 826 int i, j;
440 827
441 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 828 /* Pound the ESR really hard over the head with a big hammer - mbligh */
@@ -446,11 +833,7 @@ void __devinit setup_local_APIC(void)
446 apic_write(APIC_ESR, 0); 833 apic_write(APIC_ESR, 0);
447 } 834 }
448 835
449 value = apic_read(APIC_LVR); 836 integrated = lapic_is_integrated();
450 ver = GET_APIC_VERSION(value);
451
452 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
453 __error_in_apic_c();
454 837
455 /* 838 /*
456 * Double-check whether this APIC is really registered. 839 * Double-check whether this APIC is really registered.
@@ -521,13 +904,10 @@ void __devinit setup_local_APIC(void)
521 * like LRU than MRU (the short-term load is more even across CPUs). 904 * like LRU than MRU (the short-term load is more even across CPUs).
522 * See also the comment in end_level_ioapic_irq(). --macro 905 * See also the comment in end_level_ioapic_irq(). --macro
523 */ 906 */
524#if 1 907
525 /* Enable focus processor (bit==0) */ 908 /* Enable focus processor (bit==0) */
526 value &= ~APIC_SPIV_FOCUS_DISABLED; 909 value &= ~APIC_SPIV_FOCUS_DISABLED;
527#else 910
528 /* Disable focus processor (bit==1) */
529 value |= APIC_SPIV_FOCUS_DISABLED;
530#endif
531 /* 911 /*
532 * Set spurious IRQ vector 912 * Set spurious IRQ vector
533 */ 913 */
@@ -563,17 +943,18 @@ void __devinit setup_local_APIC(void)
563 value = APIC_DM_NMI; 943 value = APIC_DM_NMI;
564 else 944 else
565 value = APIC_DM_NMI | APIC_LVT_MASKED; 945 value = APIC_DM_NMI | APIC_LVT_MASKED;
566 if (!APIC_INTEGRATED(ver)) /* 82489DX */ 946 if (!integrated) /* 82489DX */
567 value |= APIC_LVT_LEVEL_TRIGGER; 947 value |= APIC_LVT_LEVEL_TRIGGER;
568 apic_write_around(APIC_LVT1, value); 948 apic_write_around(APIC_LVT1, value);
569 949
570 if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ 950 if (integrated && !esr_disable) { /* !82489DX */
571 maxlvt = get_maxlvt(); 951 maxlvt = lapic_get_maxlvt();
572 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 952 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
573 apic_write(APIC_ESR, 0); 953 apic_write(APIC_ESR, 0);
574 oldvalue = apic_read(APIC_ESR); 954 oldvalue = apic_read(APIC_ESR);
575 955
576 value = ERROR_APIC_VECTOR; // enables sending errors 956 /* enables sending errors */
957 value = ERROR_APIC_VECTOR;
577 apic_write_around(APIC_LVTERR, value); 958 apic_write_around(APIC_LVTERR, value);
578 /* 959 /*
579 * spec says clear errors after enabling vector. 960 * spec says clear errors after enabling vector.
@@ -586,207 +967,30 @@ void __devinit setup_local_APIC(void)
586 "vector: 0x%08lx after: 0x%08lx\n", 967 "vector: 0x%08lx after: 0x%08lx\n",
587 oldvalue, value); 968 oldvalue, value);
588 } else { 969 } else {
589 if (esr_disable) 970 if (esr_disable)
590 /* 971 /*
591 * Something untraceable is creating bad interrupts on 972 * Something untraceable is creating bad interrupts on
592 * secondary quads ... for the moment, just leave the 973 * secondary quads ... for the moment, just leave the
593 * ESR disabled - we can't do anything useful with the 974 * ESR disabled - we can't do anything useful with the
594 * errors anyway - mbligh 975 * errors anyway - mbligh
595 */ 976 */
596 printk("Leaving ESR disabled.\n"); 977 printk(KERN_INFO "Leaving ESR disabled.\n");
597 else 978 else
598 printk("No ESR for 82489DX.\n"); 979 printk(KERN_INFO "No ESR for 82489DX.\n");
599 } 980 }
600 981
982 /* Disable the local apic timer */
983 value = apic_read(APIC_LVTT);
984 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
985 apic_write_around(APIC_LVTT, value);
986
601 setup_apic_nmi_watchdog(NULL); 987 setup_apic_nmi_watchdog(NULL);
602 apic_pm_activate(); 988 apic_pm_activate();
603} 989}
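
One detail of the new hunk above is easy to miss: the timer is masked, yet LOCAL_TIMER_VECTOR is deliberately kept in the vector field. Per the comment in disable_APIC_timer() (removed later in this patch), writing an illegal vector value (0-15) into an LVT entry can raise an APIC error regardless of the mask bit. A condensed, illustrative sketch of that idiom (placeholder function name, not part of the patch):

/* Illustrative: mask the LAPIC timer while keeping a legal vector programmed */
static void demo_mask_lapic_timer(void)
{
	unsigned long v = apic_read(APIC_LVTT);

	apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
}
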
604 990
605/* 991/*
606 * If Linux enabled the LAPIC against the BIOS default 992 * Detect and initialize APIC
607 * disable it down before re-entering the BIOS on shutdown.
608 * Otherwise the BIOS may get confused and not power-off.
609 * Additionally clear all LVT entries before disable_local_APIC
610 * for the case where Linux didn't enable the LAPIC.
611 */
612void lapic_shutdown(void)
613{
614 unsigned long flags;
615
616 if (!cpu_has_apic)
617 return;
618
619 local_irq_save(flags);
620 clear_local_APIC();
621
622 if (enabled_via_apicbase)
623 disable_local_APIC();
624
625 local_irq_restore(flags);
626}
627
628#ifdef CONFIG_PM
629
630static struct {
631 int active;
632 /* r/w apic fields */
633 unsigned int apic_id;
634 unsigned int apic_taskpri;
635 unsigned int apic_ldr;
636 unsigned int apic_dfr;
637 unsigned int apic_spiv;
638 unsigned int apic_lvtt;
639 unsigned int apic_lvtpc;
640 unsigned int apic_lvt0;
641 unsigned int apic_lvt1;
642 unsigned int apic_lvterr;
643 unsigned int apic_tmict;
644 unsigned int apic_tdcr;
645 unsigned int apic_thmr;
646} apic_pm_state;
647
648static int lapic_suspend(struct sys_device *dev, pm_message_t state)
649{
650 unsigned long flags;
651 int maxlvt;
652
653 if (!apic_pm_state.active)
654 return 0;
655
656 maxlvt = get_maxlvt();
657
658 apic_pm_state.apic_id = apic_read(APIC_ID);
659 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
660 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
661 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
662 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
663 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
664 if (maxlvt >= 4)
665 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
666 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
667 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
668 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
669 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
670 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
671#ifdef CONFIG_X86_MCE_P4THERMAL
672 if (maxlvt >= 5)
673 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
674#endif
675
676 local_irq_save(flags);
677 disable_local_APIC();
678 local_irq_restore(flags);
679 return 0;
680}
681
682static int lapic_resume(struct sys_device *dev)
683{
684 unsigned int l, h;
685 unsigned long flags;
686 int maxlvt;
687
688 if (!apic_pm_state.active)
689 return 0;
690
691 maxlvt = get_maxlvt();
692
693 local_irq_save(flags);
694
695 /*
696 * Make sure the APICBASE points to the right address
697 *
698 * FIXME! This will be wrong if we ever support suspend on
699 * SMP! We'll need to do this as part of the CPU restore!
700 */
701 rdmsr(MSR_IA32_APICBASE, l, h);
702 l &= ~MSR_IA32_APICBASE_BASE;
703 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
704 wrmsr(MSR_IA32_APICBASE, l, h);
705
706 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
707 apic_write(APIC_ID, apic_pm_state.apic_id);
708 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
709 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
710 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
711 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
712 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
713 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
714#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5)
716 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
717#endif
718 if (maxlvt >= 4)
719 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
720 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
721 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
722 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
723 apic_write(APIC_ESR, 0);
724 apic_read(APIC_ESR);
725 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
726 apic_write(APIC_ESR, 0);
727 apic_read(APIC_ESR);
728 local_irq_restore(flags);
729 return 0;
730}
731
732/*
733 * This device has no shutdown method - fully functioning local APICs
734 * are needed on every CPU up until machine_halt/restart/poweroff.
735 */ 993 */
736
737static struct sysdev_class lapic_sysclass = {
738 set_kset_name("lapic"),
739 .resume = lapic_resume,
740 .suspend = lapic_suspend,
741};
742
743static struct sys_device device_lapic = {
744 .id = 0,
745 .cls = &lapic_sysclass,
746};
747
748static void __devinit apic_pm_activate(void)
749{
750 apic_pm_state.active = 1;
751}
752
753static int __init init_lapic_sysfs(void)
754{
755 int error;
756
757 if (!cpu_has_apic)
758 return 0;
759 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
760
761 error = sysdev_class_register(&lapic_sysclass);
762 if (!error)
763 error = sysdev_register(&device_lapic);
764 return error;
765}
766device_initcall(init_lapic_sysfs);
767
768#else /* CONFIG_PM */
769
770static void apic_pm_activate(void) { }
771
772#endif /* CONFIG_PM */
773
774/*
775 * Detect and enable local APICs on non-SMP boards.
776 * Original code written by Keir Fraser.
777 */
778
779static int __init apic_set_verbosity(char *str)
780{
781 if (strcmp("debug", str) == 0)
782 apic_verbosity = APIC_DEBUG;
783 else if (strcmp("verbose", str) == 0)
784 apic_verbosity = APIC_VERBOSE;
785 return 1;
786}
787
788__setup("apic=", apic_set_verbosity);
789
790static int __init detect_init_APIC (void) 994static int __init detect_init_APIC (void)
791{ 995{
792 u32 h, l, features; 996 u32 h, l, features;
@@ -798,7 +1002,7 @@ static int __init detect_init_APIC (void)
798 switch (boot_cpu_data.x86_vendor) { 1002 switch (boot_cpu_data.x86_vendor) {
799 case X86_VENDOR_AMD: 1003 case X86_VENDOR_AMD:
800 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || 1004 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
801 (boot_cpu_data.x86 == 15)) 1005 (boot_cpu_data.x86 == 15))
802 break; 1006 break;
803 goto no_apic; 1007 goto no_apic;
804 case X86_VENDOR_INTEL: 1008 case X86_VENDOR_INTEL:
@@ -812,23 +1016,23 @@ static int __init detect_init_APIC (void)
812 1016
813 if (!cpu_has_apic) { 1017 if (!cpu_has_apic) {
814 /* 1018 /*
815 * Over-ride BIOS and try to enable the local 1019 * Over-ride BIOS and try to enable the local APIC only if
816 * APIC only if "lapic" specified. 1020 * "lapic" specified.
817 */ 1021 */
818 if (enable_local_apic <= 0) { 1022 if (enable_local_apic <= 0) {
819 printk("Local APIC disabled by BIOS -- " 1023 printk(KERN_INFO "Local APIC disabled by BIOS -- "
820 "you can enable it with \"lapic\"\n"); 1024 "you can enable it with \"lapic\"\n");
821 return -1; 1025 return -1;
822 } 1026 }
823 /* 1027 /*
824 * Some BIOSes disable the local APIC in the 1028 * Some BIOSes disable the local APIC in the APIC_BASE
825 * APIC_BASE MSR. This can only be done in 1029 * MSR. This can only be done in software for Intel P6 or later
826 * software for Intel P6 or later and AMD K7 1030 * and AMD K7 (Model > 1) or later.
827 * (Model > 1) or later.
828 */ 1031 */
829 rdmsr(MSR_IA32_APICBASE, l, h); 1032 rdmsr(MSR_IA32_APICBASE, l, h);
830 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1033 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
831 printk("Local APIC disabled by BIOS -- reenabling.\n"); 1034 printk(KERN_INFO
1035 "Local APIC disabled by BIOS -- reenabling.\n");
832 l &= ~MSR_IA32_APICBASE_BASE; 1036 l &= ~MSR_IA32_APICBASE_BASE;
833 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1037 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
834 wrmsr(MSR_IA32_APICBASE, l, h); 1038 wrmsr(MSR_IA32_APICBASE, l, h);
@@ -841,7 +1045,7 @@ static int __init detect_init_APIC (void)
841 */ 1045 */
842 features = cpuid_edx(1); 1046 features = cpuid_edx(1);
843 if (!(features & (1 << X86_FEATURE_APIC))) { 1047 if (!(features & (1 << X86_FEATURE_APIC))) {
844 printk("Could not enable APIC!\n"); 1048 printk(KERN_WARNING "Could not enable APIC!\n");
845 return -1; 1049 return -1;
846 } 1050 }
847 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1051 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
@@ -855,17 +1059,20 @@ static int __init detect_init_APIC (void)
855 if (nmi_watchdog != NMI_NONE) 1059 if (nmi_watchdog != NMI_NONE)
856 nmi_watchdog = NMI_LOCAL_APIC; 1060 nmi_watchdog = NMI_LOCAL_APIC;
857 1061
858 printk("Found and enabled local APIC!\n"); 1062 printk(KERN_INFO "Found and enabled local APIC!\n");
859 1063
860 apic_pm_activate(); 1064 apic_pm_activate();
861 1065
862 return 0; 1066 return 0;
863 1067
864no_apic: 1068no_apic:
865 printk("No local APIC present or hardware disabled\n"); 1069 printk(KERN_INFO "No local APIC present or hardware disabled\n");
866 return -1; 1070 return -1;
867} 1071}
868 1072
1073/**
1074 * init_apic_mappings - initialize APIC mappings
1075 */
869void __init init_apic_mappings(void) 1076void __init init_apic_mappings(void)
870{ 1077{
871 unsigned long apic_phys; 1078 unsigned long apic_phys;
@@ -925,385 +1132,92 @@ fake_ioapic_page:
925} 1132}
926 1133
927/* 1134/*
928 * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts 1135 * This initializes the IO-APIC and APIC hardware if this is
929 * per second. We assume that the caller has already set up the local 1136 * a UP kernel.
930 * APIC.
931 *
932 * The APIC timer is not exactly sync with the external timer chip, it
933 * closely follows bus clocks.
934 */
935
936/*
937 * The timer chip is already set up at HZ interrupts per second here,
938 * but we do not accept timer interrupts yet. We only allow the BP
939 * to calibrate.
940 */
941static unsigned int __devinit get_8254_timer_count(void)
942{
943 unsigned long flags;
944
945 unsigned int count;
946
947 spin_lock_irqsave(&i8253_lock, flags);
948
949 outb_p(0x00, PIT_MODE);
950 count = inb_p(PIT_CH0);
951 count |= inb_p(PIT_CH0) << 8;
952
953 spin_unlock_irqrestore(&i8253_lock, flags);
954
955 return count;
956}
957
958/* next tick in 8254 can be caught by catching timer wraparound */
959static void __devinit wait_8254_wraparound(void)
960{
961 unsigned int curr_count, prev_count;
962
963 curr_count = get_8254_timer_count();
964 do {
965 prev_count = curr_count;
966 curr_count = get_8254_timer_count();
967
968 /* workaround for broken Mercury/Neptune */
969 if (prev_count >= curr_count + 0x100)
970 curr_count = get_8254_timer_count();
971
972 } while (prev_count >= curr_count);
973}
974
975/*
976 * Default initialization for 8254 timers. If we use other timers like HPET,
977 * we override this later
978 */
979void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound;
980
981/*
982 * This function sets up the local APIC timer, with a timeout of
983 * 'clocks' APIC bus clock. During calibration we actually call
984 * this function twice on the boot CPU, once with a bogus timeout
985 * value, second time for real. The other (noncalibrating) CPUs
986 * call this function only once, with the real, calibrated value.
987 *
988 * We do reads before writes even if unnecessary, to get around the
989 * P5 APIC double write bug.
990 */ 1137 */
991 1138int __init APIC_init_uniprocessor (void)
992#define APIC_DIVISOR 16
993
994static void __setup_APIC_LVTT(unsigned int clocks)
995{ 1139{
996 unsigned int lvtt_value, tmp_value, ver; 1140 if (enable_local_apic < 0)
997 int cpu = smp_processor_id(); 1141 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
998
999 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
1000 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
1001 if (!APIC_INTEGRATED(ver))
1002 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
1003
1004 if (cpu_isset(cpu, timer_bcast_ipi))
1005 lvtt_value |= APIC_LVT_MASKED;
1006 1142
1007 apic_write_around(APIC_LVTT, lvtt_value); 1143 if (!smp_found_config && !cpu_has_apic)
1144 return -1;
1008 1145
1009 /* 1146 /*
1010 * Divide PICLK by 16 1147 * Complain if the BIOS pretends there is one.
1011 */ 1148 */
1012 tmp_value = apic_read(APIC_TDCR); 1149 if (!cpu_has_apic &&
1013 apic_write_around(APIC_TDCR, (tmp_value 1150 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1014 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 1151 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1015 | APIC_TDR_DIV_16); 1152 boot_cpu_physical_apicid);
1016 1153 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1017 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 1154 return -1;
1018} 1155 }
1019 1156
1020static void __devinit setup_APIC_timer(unsigned int clocks) 1157 verify_local_APIC();
1021{
1022 unsigned long flags;
1023 1158
1024 local_irq_save(flags); 1159 connect_bsp_APIC();
1025 1160
1026 /* 1161 /*
1027 * Wait for IRQ0's slice: 1162 * Hack: In case of kdump, after a crash, kernel might be booting
1163 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1164 * might be zero if read from MP tables. Get it from LAPIC.
1028 */ 1165 */
1029 wait_timer_tick(); 1166#ifdef CONFIG_CRASH_DUMP
1167 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1168#endif
1169 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1030 1170
1031 __setup_APIC_LVTT(clocks); 1171 setup_local_APIC();
1032 1172
1033 local_irq_restore(flags); 1173#ifdef CONFIG_X86_IO_APIC
1174 if (smp_found_config)
1175 if (!skip_ioapic_setup && nr_ioapics)
1176 setup_IO_APIC();
1177#endif
1178 setup_boot_clock();
1179
1180 return 0;
1034} 1181}
1035 1182
1036/* 1183/*
1037 * In this function we calibrate APIC bus clocks to the external 1184 * APIC command line parameters
1038 * timer. Unfortunately we cannot use jiffies and the timer irq
1039 * to calibrate, since some later bootup code depends on getting
1040 * the first irq? Ugh.
1041 *
1042 * We want to do the calibration only once since we
1043 * want to have local timer irqs syncron. CPUs connected
1044 * by the same APIC bus have the very same bus frequency.
1045 * And we want to have irqs off anyways, no accidental
1046 * APIC irq that way.
1047 */ 1185 */
1048 1186static int __init parse_lapic(char *arg)
1049static int __init calibrate_APIC_clock(void)
1050{
1051 unsigned long long t1 = 0, t2 = 0;
1052 long tt1, tt2;
1053 long result;
1054 int i;
1055 const int LOOPS = HZ/10;
1056
1057 apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
1058
1059 /*
1060 * Put whatever arbitrary (but long enough) timeout
1061 * value into the APIC clock, we just want to get the
1062 * counter running for calibration.
1063 */
1064 __setup_APIC_LVTT(1000000000);
1065
1066 /*
1067 * The timer chip counts down to zero. Let's wait
1068 * for a wraparound to start exact measurement:
1069 * (the current tick might have been already half done)
1070 */
1071
1072 wait_timer_tick();
1073
1074 /*
1075 * We wrapped around just now. Let's start:
1076 */
1077 if (cpu_has_tsc)
1078 rdtscll(t1);
1079 tt1 = apic_read(APIC_TMCCT);
1080
1081 /*
1082 * Let's wait LOOPS wraprounds:
1083 */
1084 for (i = 0; i < LOOPS; i++)
1085 wait_timer_tick();
1086
1087 tt2 = apic_read(APIC_TMCCT);
1088 if (cpu_has_tsc)
1089 rdtscll(t2);
1090
1091 /*
1092 * The APIC bus clock counter is 32 bits only, it
1093 * might have overflown, but note that we use signed
1094 * longs, thus no extra care needed.
1095 *
1096 * underflown to be exact, as the timer counts down ;)
1097 */
1098
1099 result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
1100
1101 if (cpu_has_tsc)
1102 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
1103 "%ld.%04ld MHz.\n",
1104 ((long)(t2-t1)/LOOPS)/(1000000/HZ),
1105 ((long)(t2-t1)/LOOPS)%(1000000/HZ));
1106
1107 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
1108 "%ld.%04ld MHz.\n",
1109 result/(1000000/HZ),
1110 result%(1000000/HZ));
1111
1112 return result;
1113}
1114
1115static unsigned int calibration_result;
1116
1117void __init setup_boot_APIC_clock(void)
1118{
1119 unsigned long flags;
1120 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
1121 using_apic_timer = 1;
1122
1123 local_irq_save(flags);
1124
1125 calibration_result = calibrate_APIC_clock();
1126 /*
1127 * Now set up the timer for real.
1128 */
1129 setup_APIC_timer(calibration_result);
1130
1131 local_irq_restore(flags);
1132}
1133
1134void __devinit setup_secondary_APIC_clock(void)
1135{
1136 setup_APIC_timer(calibration_result);
1137}
1138
1139void disable_APIC_timer(void)
1140{
1141 if (using_apic_timer) {
1142 unsigned long v;
1143
1144 v = apic_read(APIC_LVTT);
1145 /*
1146 * When an illegal vector value (0-15) is written to an LVT
1147 * entry and delivery mode is Fixed, the APIC may signal an
1148 * illegal vector error, with out regard to whether the mask
1149 * bit is set or whether an interrupt is actually seen on input.
1150 *
1151 * Boot sequence might call this function when the LVTT has
1152 * '0' vector value. So make sure vector field is set to
1153 * valid value.
1154 */
1155 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1156 apic_write_around(APIC_LVTT, v);
1157 }
1158}
1159
1160void enable_APIC_timer(void)
1161{ 1187{
1162 int cpu = smp_processor_id(); 1188 enable_local_apic = 1;
1163 1189 return 0;
1164 if (using_apic_timer &&
1165 !cpu_isset(cpu, timer_bcast_ipi)) {
1166 unsigned long v;
1167
1168 v = apic_read(APIC_LVTT);
1169 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
1170 }
1171} 1190}
1191early_param("lapic", parse_lapic);
1172 1192
1173void switch_APIC_timer_to_ipi(void *cpumask) 1193static int __init parse_nolapic(char *arg)
1174{ 1194{
1175 cpumask_t mask = *(cpumask_t *)cpumask; 1195 enable_local_apic = -1;
1176 int cpu = smp_processor_id(); 1196 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1177 1197 return 0;
1178 if (cpu_isset(cpu, mask) &&
1179 !cpu_isset(cpu, timer_bcast_ipi)) {
1180 disable_APIC_timer();
1181 cpu_set(cpu, timer_bcast_ipi);
1182 }
1183} 1198}
1184EXPORT_SYMBOL(switch_APIC_timer_to_ipi); 1199early_param("nolapic", parse_nolapic);
1185 1200
1186void switch_ipi_to_APIC_timer(void *cpumask) 1201static int __init apic_set_verbosity(char *str)
1187{ 1202{
1188 cpumask_t mask = *(cpumask_t *)cpumask; 1203 if (strcmp("debug", str) == 0)
1189 int cpu = smp_processor_id(); 1204 apic_verbosity = APIC_DEBUG;
1190 1205 else if (strcmp("verbose", str) == 0)
1191 if (cpu_isset(cpu, mask) && 1206 apic_verbosity = APIC_VERBOSE;
1192 cpu_isset(cpu, timer_bcast_ipi)) { 1207 return 1;
1193 cpu_clear(cpu, timer_bcast_ipi);
1194 enable_APIC_timer();
1195 }
1196} 1208}
1197EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
1198 1209
1199#undef APIC_DIVISOR 1210__setup("apic=", apic_set_verbosity);
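
Taken together, the early_param() and __setup() handlers above give this file three command-line knobs: "lapic" force-enables the local APIC, "nolapic" disables it and clears the CPU feature bit, and "apic=verbose" / "apic=debug" raise the bring-up verbosity. An illustrative boot line (everything besides those tokens is assumed, not taken from the patch):

# illustrative kernel command lines:
#   root=/dev/sda1 ro nolapic
#   root=/dev/sda1 ro lapic apic=verbose
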
1200
1201/*
1202 * Local timer interrupt handler. It does both profiling and
1203 * process statistics/rescheduling.
1204 *
1205 * We do profiling in every local tick, statistics/rescheduling
1206 * happen only every 'profiling multiplier' ticks. The default
1207 * multiplier is 1 and it can be changed by writing the new multiplier
1208 * value into /proc/profile.
1209 */
1210
1211inline void smp_local_timer_interrupt(void)
1212{
1213 profile_tick(CPU_PROFILING);
1214#ifdef CONFIG_SMP
1215 update_process_times(user_mode_vm(get_irq_regs()));
1216#endif
1217 1211
1218 /*
1219 * We take the 'long' return path, and there every subsystem
1220 * grabs the apropriate locks (kernel lock/ irq lock).
1221 *
1222 * we might want to decouple profiling from the 'long path',
1223 * and do the profiling totally in assembly.
1224 *
1225 * Currently this isn't too much of an issue (performance wise),
1226 * we can take more than 100K local irqs per second on a 100 MHz P5.
1227 */
1228}
1229 1212
1230/* 1213/*
1231 * Local APIC timer interrupt. This is the most natural way for doing 1214 * Local APIC interrupts
1232 * local interrupts, but local timer interrupts can be emulated by
1233 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1234 *
1235 * [ if a single-CPU system runs an SMP kernel then we call the local
1236 * interrupt as well. Thus we cannot inline the local irq ... ]
1237 */ 1215 */
1238 1216
1239fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
1240{
1241 struct pt_regs *old_regs = set_irq_regs(regs);
1242 int cpu = smp_processor_id();
1243
1244 /*
1245 * the NMI deadlock-detector uses this.
1246 */
1247 per_cpu(irq_stat, cpu).apic_timer_irqs++;
1248
1249 /*
1250 * NOTE! We'd better ACK the irq immediately,
1251 * because timer handling can be slow.
1252 */
1253 ack_APIC_irq();
1254 /*
1255 * update_process_times() expects us to have done irq_enter().
1256 * Besides, if we don't timer interrupts ignore the global
1257 * interrupt lock, which is the WrongThing (tm) to do.
1258 */
1259 exit_idle();
1260 irq_enter();
1261 smp_local_timer_interrupt();
1262 irq_exit();
1263 set_irq_regs(old_regs);
1264}
1265
1266#ifndef CONFIG_SMP
1267static void up_apic_timer_interrupt_call(void)
1268{
1269 int cpu = smp_processor_id();
1270
1271 /*
1272 * the NMI deadlock-detector uses this.
1273 */
1274 per_cpu(irq_stat, cpu).apic_timer_irqs++;
1275
1276 smp_local_timer_interrupt();
1277}
1278#endif
1279
1280void smp_send_timer_broadcast_ipi(void)
1281{
1282 cpumask_t mask;
1283
1284 cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1285 if (!cpus_empty(mask)) {
1286#ifdef CONFIG_SMP
1287 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1288#else
1289 /*
1290 * We can directly call the apic timer interrupt handler
1291 * in UP case. Minus all irq related functions
1292 */
1293 up_apic_timer_interrupt_call();
1294#endif
1295 }
1296}
1297
1298int setup_profiling_timer(unsigned int multiplier)
1299{
1300 return -EINVAL;
1301}
1302
1303/* 1217/*
1304 * This interrupt should _never_ happen with our APIC/SMP architecture 1218 * This interrupt should _never_ happen with our APIC/SMP architecture
1305 */ 1219 */
1306fastcall void smp_spurious_interrupt(struct pt_regs *regs) 1220void smp_spurious_interrupt(struct pt_regs *regs)
1307{ 1221{
1308 unsigned long v; 1222 unsigned long v;
1309 1223
@@ -1319,16 +1233,15 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
1319 ack_APIC_irq(); 1233 ack_APIC_irq();
1320 1234
1321 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1235 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1322 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", 1236 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1323 smp_processor_id()); 1237 "should never happen.\n", smp_processor_id());
1324 irq_exit(); 1238 irq_exit();
1325} 1239}
1326 1240
1327/* 1241/*
1328 * This interrupt should never happen with our APIC/SMP architecture 1242 * This interrupt should never happen with our APIC/SMP architecture
1329 */ 1243 */
1330 1244void smp_error_interrupt(struct pt_regs *regs)
1331fastcall void smp_error_interrupt(struct pt_regs *regs)
1332{ 1245{
1333 unsigned long v, v1; 1246 unsigned long v, v1;
1334 1247
@@ -1352,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
1352 7: Illegal register address 1265 7: Illegal register address
1353 */ 1266 */
1354 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1267 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1355 smp_processor_id(), v , v1); 1268 smp_processor_id(), v , v1);
1356 irq_exit(); 1269 irq_exit();
1357} 1270}
1358 1271
1359/* 1272/*
1360 * This initializes the IO-APIC and APIC hardware if this is 1273 * Initialize APIC interrupts
1361 * a UP kernel.
1362 */ 1274 */
1363int __init APIC_init_uniprocessor (void) 1275void __init apic_intr_init(void)
1364{ 1276{
1365 if (enable_local_apic < 0) 1277#ifdef CONFIG_SMP
1366 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1278 smp_intr_init();
1279#endif
1280 /* self generated IPI for local APIC timer */
1281 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1367 1282
1368 if (!smp_found_config && !cpu_has_apic) 1283 /* IPI vectors for APIC spurious and error interrupts */
1369 return -1; 1284 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1285 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1370 1286
1371 /* 1287 /* thermal monitor LVT interrupt */
1372 * Complain if the BIOS pretends there is one. 1288#ifdef CONFIG_X86_MCE_P4THERMAL
1373 */ 1289 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1374 if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1290#endif
1375 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1291}
1376 boot_cpu_physical_apicid); 1292
1377 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1293/**
1378 return -1; 1294 * connect_bsp_APIC - attach the APIC to the interrupt system
1295 */
1296void __init connect_bsp_APIC(void)
1297{
1298 if (pic_mode) {
1299 /*
1300 * Do not trust the local APIC being empty at bootup.
1301 */
1302 clear_local_APIC();
1303 /*
1304 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1305 * local APIC to INT and NMI lines.
1306 */
1307 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1308 "enabling APIC mode.\n");
1309 outb(0x70, 0x22);
1310 outb(0x01, 0x23);
1379 } 1311 }
1312 enable_apic_mode();
1313}
1380 1314
1381 verify_local_APIC(); 1315/**
1316 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1317 * @virt_wire_setup: indicates whether virtual wire mode is selected
1318 *
1319 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1320 * APIC is disabled.
1321 */
1322void disconnect_bsp_APIC(int virt_wire_setup)
1323{
1324 if (pic_mode) {
1325 /*
1326 * Put the board back into PIC mode (has an effect only on
1327 * certain older boards). Note that APIC interrupts, including
1328 * IPIs, won't work beyond this point! The only exception are
1329 * INIT IPIs.
1330 */
1331 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1332 "entering PIC mode.\n");
1333 outb(0x70, 0x22);
1334 outb(0x00, 0x23);
1335 } else {
1336 /* Go back to Virtual Wire compatibility mode */
1337 unsigned long value;
1382 1338
1383 connect_bsp_APIC(); 1339 /* For the spurious interrupt use vector F, and enable it */
1340 value = apic_read(APIC_SPIV);
1341 value &= ~APIC_VECTOR_MASK;
1342 value |= APIC_SPIV_APIC_ENABLED;
1343 value |= 0xf;
1344 apic_write_around(APIC_SPIV, value);
1384 1345
1385 /* 1346 if (!virt_wire_setup) {
1386 * Hack: In case of kdump, after a crash, kernel might be booting 1347 /*
1387 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1348 * For LVT0 make it edge triggered, active high,
1388 * might be zero if read from MP tables. Get it from LAPIC. 1349 * external and enabled
1389 */ 1350 */
1390#ifdef CONFIG_CRASH_DUMP 1351 value = apic_read(APIC_LVT0);
1391 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); 1352 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1392#endif 1353 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1393 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 1354 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
1355 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1356 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1357 apic_write_around(APIC_LVT0, value);
1358 } else {
1359 /* Disable LVT0 */
1360 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1361 }
1394 1362
1395 setup_local_APIC(); 1363 /*
1364 * For LVT1 make it edge triggered, active high, nmi and
1365 * enabled
1366 */
1367 value = apic_read(APIC_LVT1);
1368 value &= ~(
1369 APIC_MODE_MASK | APIC_SEND_PENDING |
1370 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1371 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1372 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1373 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1374 apic_write_around(APIC_LVT1, value);
1375 }
1376}
1396 1377
1397#ifdef CONFIG_X86_IO_APIC 1378/*
1398 if (smp_found_config) 1379 * Power management
1399 if (!skip_ioapic_setup && nr_ioapics) 1380 */
1400 setup_IO_APIC(); 1381#ifdef CONFIG_PM
1382
1383static struct {
1384 int active;
1385 /* r/w apic fields */
1386 unsigned int apic_id;
1387 unsigned int apic_taskpri;
1388 unsigned int apic_ldr;
1389 unsigned int apic_dfr;
1390 unsigned int apic_spiv;
1391 unsigned int apic_lvtt;
1392 unsigned int apic_lvtpc;
1393 unsigned int apic_lvt0;
1394 unsigned int apic_lvt1;
1395 unsigned int apic_lvterr;
1396 unsigned int apic_tmict;
1397 unsigned int apic_tdcr;
1398 unsigned int apic_thmr;
1399} apic_pm_state;
1400
1401static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1402{
1403 unsigned long flags;
1404 int maxlvt;
1405
1406 if (!apic_pm_state.active)
1407 return 0;
1408
1409 maxlvt = lapic_get_maxlvt();
1410
1411 apic_pm_state.apic_id = apic_read(APIC_ID);
1412 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1413 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1414 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1415 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1416 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1417 if (maxlvt >= 4)
1418 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1419 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1420 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1421 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1422 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1423 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1424#ifdef CONFIG_X86_MCE_P4THERMAL
1425 if (maxlvt >= 5)
1426 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1401#endif 1427#endif
1402 setup_boot_clock();
1403 1428
1429 local_irq_save(flags);
1430 disable_local_APIC();
1431 local_irq_restore(flags);
1404 return 0; 1432 return 0;
1405} 1433}
1406 1434
1407static int __init parse_lapic(char *arg) 1435static int lapic_resume(struct sys_device *dev)
1408{ 1436{
1409 lapic_enable(); 1437 unsigned int l, h;
1438 unsigned long flags;
1439 int maxlvt;
1440
1441 if (!apic_pm_state.active)
1442 return 0;
1443
1444 maxlvt = lapic_get_maxlvt();
1445
1446 local_irq_save(flags);
1447
1448 /*
1449 * Make sure the APICBASE points to the right address
1450 *
1451 * FIXME! This will be wrong if we ever support suspend on
1452 * SMP! We'll need to do this as part of the CPU restore!
1453 */
1454 rdmsr(MSR_IA32_APICBASE, l, h);
1455 l &= ~MSR_IA32_APICBASE_BASE;
1456 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1457 wrmsr(MSR_IA32_APICBASE, l, h);
1458
1459 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1460 apic_write(APIC_ID, apic_pm_state.apic_id);
1461 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1462 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1463 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1464 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1465 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1466 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1467#ifdef CONFIG_X86_MCE_P4THERMAL
1468 if (maxlvt >= 5)
1469 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1470#endif
1471 if (maxlvt >= 4)
1472 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1473 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1474 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1475 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1476 apic_write(APIC_ESR, 0);
1477 apic_read(APIC_ESR);
1478 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1479 apic_write(APIC_ESR, 0);
1480 apic_read(APIC_ESR);
1481 local_irq_restore(flags);
1410 return 0; 1482 return 0;
1411} 1483}
1412early_param("lapic", parse_lapic);
1413 1484
1414static int __init parse_nolapic(char *arg) 1485/*
1486 * This device has no shutdown method - fully functioning local APICs
1487 * are needed on every CPU up until machine_halt/restart/poweroff.
1488 */
1489
1490static struct sysdev_class lapic_sysclass = {
1491 set_kset_name("lapic"),
1492 .resume = lapic_resume,
1493 .suspend = lapic_suspend,
1494};
1495
1496static struct sys_device device_lapic = {
1497 .id = 0,
1498 .cls = &lapic_sysclass,
1499};
1500
1501static void __devinit apic_pm_activate(void)
1415{ 1502{
1416 lapic_disable(); 1503 apic_pm_state.active = 1;
1417 return 0;
1418} 1504}
1419early_param("nolapic", parse_nolapic);
1420 1505
1506static int __init init_lapic_sysfs(void)
1507{
1508 int error;
1509
1510 if (!cpu_has_apic)
1511 return 0;
1512 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1513
1514 error = sysdev_class_register(&lapic_sysclass);
1515 if (!error)
1516 error = sysdev_register(&device_lapic);
1517 return error;
1518}
1519device_initcall(init_lapic_sysfs);
1520
1521#else /* CONFIG_PM */
1522
1523static void apic_pm_activate(void) { }
1524
1525#endif /* CONFIG_PM */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index f9ba0af7ee1f..064bbf2861f4 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -236,7 +236,6 @@
236 236
237#include "io_ports.h" 237#include "io_ports.h"
238 238
239extern unsigned long get_cmos_time(void);
240extern void machine_real_restart(unsigned char *, int); 239extern void machine_real_restart(unsigned char *, int);
241 240
242#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 241#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
@@ -1176,28 +1175,6 @@ out:
1176 spin_unlock(&user_list_lock); 1175 spin_unlock(&user_list_lock);
1177} 1176}
1178 1177
1179static void set_time(void)
1180{
1181 struct timespec ts;
1182 if (got_clock_diff) { /* Must know time zone in order to set clock */
1183 ts.tv_sec = get_cmos_time() + clock_cmos_diff;
1184 ts.tv_nsec = 0;
1185 do_settimeofday(&ts);
1186 }
1187}
1188
1189static void get_time_diff(void)
1190{
1191#ifndef CONFIG_APM_RTC_IS_GMT
1192 /*
1193 * Estimate time zone so that set_time can update the clock
1194 */
1195 clock_cmos_diff = -get_cmos_time();
1196 clock_cmos_diff += get_seconds();
1197 got_clock_diff = 1;
1198#endif
1199}
1200
1201static void reinit_timer(void) 1178static void reinit_timer(void)
1202{ 1179{
1203#ifdef INIT_TIMER_AFTER_SUSPEND 1180#ifdef INIT_TIMER_AFTER_SUSPEND
@@ -1237,19 +1214,6 @@ static int suspend(int vetoable)
1237 local_irq_disable(); 1214 local_irq_disable();
1238 device_power_down(PMSG_SUSPEND); 1215 device_power_down(PMSG_SUSPEND);
1239 1216
1240 /* serialize with the timer interrupt */
1241 write_seqlock(&xtime_lock);
1242
1243 /* protect against access to timer chip registers */
1244 spin_lock(&i8253_lock);
1245
1246 get_time_diff();
1247 /*
1248 * Irq spinlock must be dropped around set_system_power_state.
1249 * We'll undo any timer changes due to interrupts below.
1250 */
1251 spin_unlock(&i8253_lock);
1252 write_sequnlock(&xtime_lock);
1253 local_irq_enable(); 1217 local_irq_enable();
1254 1218
1255 save_processor_state(); 1219 save_processor_state();
@@ -1258,7 +1222,6 @@ static int suspend(int vetoable)
1258 restore_processor_state(); 1222 restore_processor_state();
1259 1223
1260 local_irq_disable(); 1224 local_irq_disable();
1261 set_time();
1262 reinit_timer(); 1225 reinit_timer();
1263 1226
1264 if (err == APM_NO_ERROR) 1227 if (err == APM_NO_ERROR)
@@ -1288,11 +1251,6 @@ static void standby(void)
1288 1251
1289 local_irq_disable(); 1252 local_irq_disable();
1290 device_power_down(PMSG_SUSPEND); 1253 device_power_down(PMSG_SUSPEND);
1291 /* serialize with the timer interrupt */
1292 write_seqlock(&xtime_lock);
1293 /* If needed, notify drivers here */
1294 get_time_diff();
1295 write_sequnlock(&xtime_lock);
1296 local_irq_enable(); 1254 local_irq_enable();
1297 1255
1298 err = set_system_power_state(APM_STATE_STANDBY); 1256 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1386,7 +1344,6 @@ static void check_events(void)
1386 ignore_bounce = 1; 1344 ignore_bounce = 1;
1387 if ((event != APM_NORMAL_RESUME) 1345 if ((event != APM_NORMAL_RESUME)
1388 || (ignore_normal_resume == 0)) { 1346 || (ignore_normal_resume == 0)) {
1389 set_time();
1390 device_resume(); 1347 device_resume();
1391 pm_send_all(PM_RESUME, (void *)0); 1348 pm_send_all(PM_RESUME, (void *)0);
1392 queue_event(event, NULL); 1349 queue_event(event, NULL);
@@ -1402,7 +1359,6 @@ static void check_events(void)
1402 break; 1359 break;
1403 1360
1404 case APM_UPDATE_TIME: 1361 case APM_UPDATE_TIME:
1405 set_time();
1406 break; 1362 break;
1407 1363
1408 case APM_CRITICAL_SUSPEND: 1364 case APM_CRITICAL_SUSPEND:
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index 5299c5bf4454..6c52182ca323 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -217,6 +217,15 @@ config X86_LONGHAUL
217 217
218 If in doubt, say N. 218 If in doubt, say N.
219 219
220config X86_E_POWERSAVER
221 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
222 select CPU_FREQ_TABLE
223 depends on EXPERIMENTAL
224 help
225 This adds the CPUFreq driver for VIA C7 processors.
226
227 If in doubt, say N.
228
220comment "shared options" 229comment "shared options"
221 230
222config X86_ACPI_CPUFREQ_PROC_INTF 231config X86_ACPI_CPUFREQ_PROC_INTF
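
For context, wiring the new driver into a build would look roughly like this .config fragment (illustrative; CONFIG_X86_E_POWERSAVER is the symbol added above and selects CPU_FREQ_TABLE, the rest is the usual cpufreq scaffolding):

# illustrative .config fragment
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_TABLE=y
CONFIG_X86_E_POWERSAVER=m
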
diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile
index 8de3abe322a9..560f7760dae5 100644
--- a/arch/i386/kernel/cpu/cpufreq/Makefile
+++ b/arch/i386/kernel/cpu/cpufreq/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
5obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o 6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
6obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o 7obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
7obj-$(CONFIG_X86_LONGRUN) += longrun.o 8obj-$(CONFIG_X86_LONGRUN) += longrun.o
diff --git a/arch/i386/kernel/cpu/cpufreq/e_powersaver.c b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
new file mode 100644
index 000000000000..f43d98e11cc7
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
@@ -0,0 +1,334 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15
16#include <asm/msr.h>
17#include <asm/tsc.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20#include <asm/delay.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26
27struct eps_cpu_data {
28 u32 fsb;
29 struct cpufreq_frequency_table freq_table[];
30};
31
32static struct eps_cpu_data *eps_cpu[NR_CPUS];
33
34
35static unsigned int eps_get(unsigned int cpu)
36{
37 struct eps_cpu_data *centaur;
38 u32 lo, hi;
39
40 if (cpu)
41 return 0;
42 centaur = eps_cpu[cpu];
43 if (centaur == NULL)
44 return 0;
45
46 /* Return current frequency */
47 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
48 return centaur->fsb * ((lo >> 8) & 0xff);
49}
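
eps_get() above reports FSB x multiplier, where the multiplier sits in bits 15:8 of the low word of MSR_IA32_PERF_STATUS and bits 7:0 carry the voltage ID. A small worked sketch (helper name and sample numbers are illustrative, assuming <linux/types.h> for u32):

/*
 * Illustrative only: decode a PERF_STATUS low word the way eps_get() does.
 * With fsb_khz = 100000 and lo = 0x0f04 (multiplier 15, VID 4) this yields
 * 1500000 kHz, i.e. a 1.5 GHz part.
 */
static unsigned int eps_decode_khz(u32 lo, u32 fsb_khz)
{
	return fsb_khz * ((lo >> 8) & 0xff);
}
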
50
51static int eps_set_state(struct eps_cpu_data *centaur,
52 unsigned int cpu,
53 u32 dest_state)
54{
55 struct cpufreq_freqs freqs;
56 u32 lo, hi;
57 int err = 0;
58 int i;
59
60 freqs.old = eps_get(cpu);
61 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
62 freqs.cpu = cpu;
63 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
64
65 /* Wait while CPU is busy */
66 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
67 i = 0;
68 while (lo & ((1 << 16) | (1 << 17))) {
69 udelay(16);
70 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
71 i++;
72 if (unlikely(i > 64)) {
73 err = -ENODEV;
74 goto postchange;
75 }
76 }
77 /* Set new multiplier and voltage */
78 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
79 /* Wait until transition end */
80 i = 0;
81 do {
82 udelay(16);
83 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
84 i++;
85 if (unlikely(i > 64)) {
86 err = -ENODEV;
87 goto postchange;
88 }
89 } while (lo & ((1 << 16) | (1 << 17)));
90
91 /* Return current frequency */
92postchange:
93 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
94 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
95
96 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
97 return err;
98}
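
The two poll loops in eps_set_state() above bound how long the driver waits on the status bits it polls (bits 16 and 17 of the PERF_STATUS low word): each loop gives up after 64 iterations of udelay(16), roughly a millisecond per wait, after which -ENODEV is returned and eps_target() prints "eps: Timeout!". Stated as constants (illustrative restatement of the literals above, not part of the patch):

/* Worst-case wait per poll loop in eps_set_state(), from the literals above */
#define EPS_POLL_TRIES		64
#define EPS_POLL_STEP_US	16
/* 64 * 16 us = 1024 us, i.e. about 1 ms before the loop bails out */
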
99
100static int eps_target(struct cpufreq_policy *policy,
101 unsigned int target_freq,
102 unsigned int relation)
103{
104 struct eps_cpu_data *centaur;
105 unsigned int newstate = 0;
106 unsigned int cpu = policy->cpu;
107 unsigned int dest_state;
108 int ret;
109
110 if (unlikely(eps_cpu[cpu] == NULL))
111 return -ENODEV;
112 centaur = eps_cpu[cpu];
113
114 if (unlikely(cpufreq_frequency_table_target(policy,
115 &eps_cpu[cpu]->freq_table[0],
116 target_freq,
117 relation,
118 &newstate))) {
119 return -EINVAL;
120 }
121
122 /* Make frequency transition */
123 dest_state = centaur->freq_table[newstate].index & 0xffff;
124 ret = eps_set_state(centaur, cpu, dest_state);
125 if (ret)
126 printk(KERN_ERR "eps: Timeout!\n");
127 return ret;
128}
129
130static int eps_verify(struct cpufreq_policy *policy)
131{
132 return cpufreq_frequency_table_verify(policy,
133 &eps_cpu[policy->cpu]->freq_table[0]);
134}
135
136static int eps_cpu_init(struct cpufreq_policy *policy)
137{
138 unsigned int i;
139 u32 lo, hi;
140 u64 val;
141 u8 current_multiplier, current_voltage;
142 u8 max_multiplier, max_voltage;
143 u8 min_multiplier, min_voltage;
144 u8 brand;
145 u32 fsb;
146 struct eps_cpu_data *centaur;
147 struct cpufreq_frequency_table *f_table;
148 int k, step, voltage;
149 int ret;
150 int states;
151
152 if (policy->cpu != 0)
153 return -ENODEV;
154
155 /* Check brand */
156 printk("eps: Detected VIA ");
157 rdmsr(0x1153, lo, hi);
158 brand = (((lo >> 2) ^ lo) >> 18) & 3;
159 switch(brand) {
160 case EPS_BRAND_C7M:
161 printk("C7-M\n");
162 break;
163 case EPS_BRAND_C7:
164 printk("C7\n");
165 break;
166 case EPS_BRAND_EDEN:
167 printk("Eden\n");
168 break;
169 case EPS_BRAND_C3:
170 printk("C3\n");
171 return -ENODEV;
172 break;
173 }
174 /* Enable Enhanced PowerSaver */
175 rdmsrl(MSR_IA32_MISC_ENABLE, val);
176 if (!(val & 1 << 16)) {
177 val |= 1 << 16;
178 wrmsrl(MSR_IA32_MISC_ENABLE, val);
179 /* Can be locked at 0 */
180 rdmsrl(MSR_IA32_MISC_ENABLE, val);
181 if (!(val & 1 << 16)) {
182 printk("eps: Can't enable Enhanced PowerSaver\n");
183 return -ENODEV;
184 }
185 }
186
187 /* Print voltage and multiplier */
188 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
189 current_voltage = lo & 0xff;
190 printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
191 current_multiplier = (lo >> 8) & 0xff;
192 printk("eps: Current multiplier = %d\n", current_multiplier);
193
194 /* Print limits */
195 max_voltage = hi & 0xff;
196 printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
197 max_multiplier = (hi >> 8) & 0xff;
198 printk("eps: Highest multiplier = %d\n", max_multiplier);
199 min_voltage = (hi >> 16) & 0xff;
200 printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
201 min_multiplier = (hi >> 24) & 0xff;
202 printk("eps: Lowest multiplier = %d\n", min_multiplier);
203
204 /* Sanity checks */
205 if (current_multiplier == 0 || max_multiplier == 0
206 || min_multiplier == 0)
207 return -EINVAL;
208 if (current_multiplier > max_multiplier
209 || max_multiplier <= min_multiplier)
210 return -EINVAL;
211 if (current_voltage > 0x1c || max_voltage > 0x1c)
212 return -EINVAL;
213 if (max_voltage < min_voltage)
214 return -EINVAL;
215
216 /* Calc FSB speed */
217 fsb = cpu_khz / current_multiplier;
218 /* Calc number of p-states supported */
219 if (brand == EPS_BRAND_C7M)
220 states = max_multiplier - min_multiplier + 1;
221 else
222 states = 2;
223
224 /* Allocate private data and frequency table for current cpu */
225 centaur = kzalloc(sizeof(struct eps_cpu_data)
226 + (states + 1) * sizeof(struct cpufreq_frequency_table),
227 GFP_KERNEL);
228 if (!centaur)
229 return -ENOMEM;
230 eps_cpu[0] = centaur;
231
232 /* Copy basic values */
233 centaur->fsb = fsb;
234
235 /* Fill frequency and MSR value table */
236 f_table = &centaur->freq_table[0];
237 if (brand != EPS_BRAND_C7M) {
238 f_table[0].frequency = fsb * min_multiplier;
239 f_table[0].index = (min_multiplier << 8) | min_voltage;
240 f_table[1].frequency = fsb * max_multiplier;
241 f_table[1].index = (max_multiplier << 8) | max_voltage;
242 f_table[2].frequency = CPUFREQ_TABLE_END;
243 } else {
244 k = 0;
245 step = ((max_voltage - min_voltage) * 256)
246 / (max_multiplier - min_multiplier);
247 for (i = min_multiplier; i <= max_multiplier; i++) {
248 voltage = (k * step) / 256 + min_voltage;
249 f_table[k].frequency = fsb * i;
250 f_table[k].index = (i << 8) | voltage;
251 k++;
252 }
253 f_table[k].frequency = CPUFREQ_TABLE_END;
254 }
255
256 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
257 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
258 policy->cur = fsb * current_multiplier;
259
260 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
261 if (ret) {
262 kfree(centaur);
263 return ret;
264 }
265
266 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
267 return 0;
268}
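
On C7-M parts the table construction above interpolates the voltage ID linearly between the limits, in 1/256 steps to stay in integer arithmetic; the printks earlier in eps_cpu_init() show the VID-to-millivolt rule mV = VID * 16 + 700. A worked example with assumed limits (not measured values):

/*
 * Illustrative numbers only: min_multiplier = 4, max_multiplier = 15,
 * min_voltage = 4 (764 mV), max_voltage = 28 (1148 mV).
 *
 *   step         = (28 - 4) * 256 / (15 - 4) = 558
 *   entry k = 5  -> multiplier 9,  VID = 5 * 558 / 256 + 4 = 14  -> 924 mV
 *   entry k = 11 -> multiplier 15, VID = 11 * 558 / 256 + 4 = 27 -> 1132 mV
 *
 * Integer truncation keeps every interpolated VID at or below max_voltage.
 */
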
269
270static int eps_cpu_exit(struct cpufreq_policy *policy)
271{
272 unsigned int cpu = policy->cpu;
273 struct eps_cpu_data *centaur;
274 u32 lo, hi;
275
276 if (eps_cpu[cpu] == NULL)
277 return -ENODEV;
278 centaur = eps_cpu[cpu];
279
280 /* Get max frequency */
281 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
282 /* Set max frequency */
283 eps_set_state(centaur, cpu, hi & 0xffff);
284 /* Bye */
285 cpufreq_frequency_table_put_attr(policy->cpu);
286 kfree(eps_cpu[cpu]);
287 eps_cpu[cpu] = NULL;
288 return 0;
289}
290
291static struct freq_attr* eps_attr[] = {
292 &cpufreq_freq_attr_scaling_available_freqs,
293 NULL,
294};
295
296static struct cpufreq_driver eps_driver = {
297 .verify = eps_verify,
298 .target = eps_target,
299 .init = eps_cpu_init,
300 .exit = eps_cpu_exit,
301 .get = eps_get,
302 .name = "e_powersaver",
303 .owner = THIS_MODULE,
304 .attr = eps_attr,
305};
306
307static int __init eps_init(void)
308{
309 struct cpuinfo_x86 *c = cpu_data;
310
311 /* This driver will work only on Centaur C7 processors with
312 * Enhanced SpeedStep/PowerSaver registers */
313 if (c->x86_vendor != X86_VENDOR_CENTAUR
314 || c->x86 != 6 || c->x86_model != 10)
315 return -ENODEV;
316 if (!cpu_has(c, X86_FEATURE_EST))
317 return -ENODEV;
318
319 if (cpufreq_register_driver(&eps_driver))
320 return -EINVAL;
321 return 0;
322}
323
324static void __exit eps_exit(void)
325{
326 cpufreq_unregister_driver(&eps_driver);
327}
328
329MODULE_AUTHOR("Rafał Bilski <rafalbilski@interia.pl>");
330MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
331MODULE_LICENSE("GPL");
332
333module_init(eps_init);
334module_exit(eps_exit);
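For readers following the MSR arithmetic in eps_cpu_init() above, the sketch below is a minimal user-space restatement of the PERF_STATUS decoding; only the bit layout and the "code * 16 + 700" mV formula come from the driver code, while the helper name and the sample register value are invented for illustration and nothing here touches real hardware.

#include <stdint.h>
#include <stdio.h>

/* Decode the fields eps_cpu_init() reads from MSR_IA32_PERF_STATUS:
 * bits  0-7  current voltage code, bits  8-15 current multiplier,
 * bits 32-39 highest voltage code, bits 40-47 highest multiplier,
 * bits 48-55 lowest voltage code,  bits 56-63 lowest multiplier.
 * A voltage code v means (v * 16 + 700) mV. */
static void eps_decode_perf_status(uint64_t val)
{
	uint32_t lo = (uint32_t)val;
	uint32_t hi = (uint32_t)(val >> 32);

	printf("current: %u mV, %ux\n", (lo & 0xff) * 16 + 700, (lo >> 8) & 0xff);
	printf("highest: %u mV, %ux\n", (hi & 0xff) * 16 + 700, (hi >> 8) & 0xff);
	printf("lowest : %u mV, %ux\n",
	       ((hi >> 16) & 0xff) * 16 + 700, (hi >> 24) & 0xff);
}

int main(void)
{
	/* made-up sample: 16.0x at 1148 mV now, range 4.0x/764 mV .. 16.0x/1148 mV */
	eps_decode_perf_status(0x0404101c0000101cULL);
	return 0;
}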
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index a3db9332d652..b59878a0d9b3 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -8,12 +8,11 @@
8 * VIA have currently 3 different versions of Longhaul. 8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. 9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. 10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is the same as v1, but adds voltage scaling. 11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C) 12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * voltage scaling support has currently been disabled in this driver 13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * until we have code that gets it right.
15 * Version 3 of longhaul got renamed to Powersaver and redesigned 14 * Version 3 of longhaul got renamed to Powersaver and redesigned
16 * to use the POWERSAVER MSR at 0x110a. 15 * to use only the POWERSAVER MSR at 0x110a.
17 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. 16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
18 * It's pretty much the same feature wise to longhaul v2, though 17 * It's pretty much the same feature wise to longhaul v2, though
19 * there is provision for scaling FSB too, but this doesn't work 18 * there is provision for scaling FSB too, but this doesn't work
@@ -51,10 +50,12 @@
51#define CPU_EZRA 3 50#define CPU_EZRA 3
52#define CPU_EZRA_T 4 51#define CPU_EZRA_T 4
53#define CPU_NEHEMIAH 5 52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54 54
55/* Flags */ 55/* Flags */
56#define USE_ACPI_C3 (1 << 1) 56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2) 57#define USE_NORTHBRIDGE (1 << 2)
58#define USE_VT8235 (1 << 3)
58 59
59static int cpu_model; 60static int cpu_model;
60static unsigned int numscales=16; 61static unsigned int numscales=16;
@@ -63,7 +64,8 @@ static unsigned int fsb;
63static struct mV_pos *vrm_mV_table; 64static struct mV_pos *vrm_mV_table;
64static unsigned char *mV_vrm_table; 65static unsigned char *mV_vrm_table;
65struct f_msr { 66struct f_msr {
66 unsigned char vrm; 67 u8 vrm;
68 u8 pos;
67}; 69};
68static struct f_msr f_msr_table[32]; 70static struct f_msr f_msr_table[32];
69 71
@@ -73,10 +75,10 @@ static int can_scale_voltage;
73static struct acpi_processor *pr = NULL; 75static struct acpi_processor *pr = NULL;
74static struct acpi_processor_cx *cx = NULL; 76static struct acpi_processor_cx *cx = NULL;
75static u8 longhaul_flags; 77static u8 longhaul_flags;
78static u8 longhaul_pos;
76 79
77/* Module parameters */ 80/* Module parameters */
78static int scale_voltage; 81static int scale_voltage;
79static int ignore_latency;
80 82
81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) 83#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
82 84
@@ -164,26 +166,47 @@ static void do_longhaul1(unsigned int clock_ratio_index)
164static void do_powersaver(int cx_address, unsigned int clock_ratio_index) 166static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
165{ 167{
166 union msr_longhaul longhaul; 168 union msr_longhaul longhaul;
169 u8 dest_pos;
167 u32 t; 170 u32 t;
168 171
172 dest_pos = f_msr_table[clock_ratio_index].pos;
173
169 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); 174 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
175 /* Setup new frequency */
170 longhaul.bits.RevisionKey = longhaul.bits.RevisionID; 176 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
171 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; 177 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
172 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; 178 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
173 longhaul.bits.EnableSoftBusRatio = 1; 179 /* Setup new voltage */
174 180 if (can_scale_voltage)
175 if (can_scale_voltage) {
176 longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; 181 longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && longhaul_pos < dest_pos) {
177 longhaul.bits.EnableSoftVID = 1; 186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 longhaul_pos = dest_pos;
178 } 203 }
179 204
180 /* Sync to timer tick */
181 safe_halt();
182 /* Change frequency on next halt or sleep */ 205 /* Change frequency on next halt or sleep */
206 longhaul.bits.EnableSoftBusRatio = 1;
183 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); 207 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
184 if (!cx_address) { 208 if (!cx_address) {
185 ACPI_FLUSH_CPU_CACHE(); 209 ACPI_FLUSH_CPU_CACHE();
186 /* Invoke C1 */
187 halt(); 210 halt();
188 } else { 211 } else {
189 ACPI_FLUSH_CPU_CACHE(); 212 ACPI_FLUSH_CPU_CACHE();
@@ -193,12 +216,29 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
193 t = inl(acpi_gbl_FADT.xpm_timer_block.address); 216 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
194 } 217 }
195 /* Disable bus ratio bit */ 218 /* Disable bus ratio bit */
196 local_irq_disable();
197 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
198 longhaul.bits.EnableSoftBusRatio = 0; 219 longhaul.bits.EnableSoftBusRatio = 0;
199 longhaul.bits.EnableSoftBSEL = 0;
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); 220 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
221
222 /* Reduce voltage if necessary */
223 if (can_scale_voltage && longhaul_pos > dest_pos) {
224 longhaul.bits.EnableSoftVID = 1;
225 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
226 /* Change voltage */
227 if (!cx_address) {
228 ACPI_FLUSH_CPU_CACHE();
229 halt();
230 } else {
231 ACPI_FLUSH_CPU_CACHE();
232 /* Invoke C3 */
233 inb(cx_address);
234 /* Dummy op - must do something useless after P_LVL3
235 * read */
236 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
237 }
238 longhaul.bits.EnableSoftVID = 0;
239 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
240 longhaul_pos = dest_pos;
241 }
202} 242}
203 243
204/** 244/**
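/*
 * Standalone sketch of the ordering rule the reworked do_powersaver()
 * above enforces: the VID is raised before a multiplier increase and
 * lowered only after a multiplier decrease, so the core never runs a
 * high ratio at a too-low voltage.  set_vid() and set_ratio() are
 * printf stand-ins invented for this illustration, not the driver's
 * LONGHAUL MSR writes.
 */
#include <stdio.h>

static int cur_pos;			/* mirrors longhaul_pos */

static void set_vid(int pos)   { printf("  VID position -> %d\n", pos); cur_pos = pos; }
static void set_ratio(int idx) { printf("  bus ratio    -> index %d\n", idx); }

static void transition(int ratio_idx, int dest_pos)
{
	if (cur_pos < dest_pos)		/* speeding up: raise voltage first */
		set_vid(dest_pos);
	set_ratio(ratio_idx);
	if (cur_pos > dest_pos)		/* slowing down: drop voltage last */
		set_vid(dest_pos);
}

int main(void)
{
	cur_pos = 4;
	printf("speed up:\n");
	transition(12, 9);
	printf("slow down:\n");
	transition(3, 2);
	return 0;
}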
@@ -257,26 +297,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
257 /* 297 /*
258 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) 298 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
259 * Software controlled multipliers only. 299 * Software controlled multipliers only.
260 *
261 * *NB* Until we get voltage scaling working v1 & v2 are the same code.
262 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C]
263 */ 300 */
264 case TYPE_LONGHAUL_V1: 301 case TYPE_LONGHAUL_V1:
265 case TYPE_LONGHAUL_V2:
266 do_longhaul1(clock_ratio_index); 302 do_longhaul1(clock_ratio_index);
267 break; 303 break;
268 304
269 /* 305 /*
306 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
307 *
270 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) 308 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
271 * We can scale voltage with this too, but that's currently
272 * disabled until we come up with a decent 'match freq to voltage'
273 * algorithm.
274 * When we add voltage scaling, we will also need to do the
275 * voltage/freq setting in order depending on the direction
276 * of scaling (like we do in powernow-k7.c)
277 * Nehemiah can do FSB scaling too, but this has never been proven 309 * Nehemiah can do FSB scaling too, but this has never been proven
278 * to work in practice. 310 * to work in practice.
279 */ 311 */
312 case TYPE_LONGHAUL_V2:
280 case TYPE_POWERSAVER: 313 case TYPE_POWERSAVER:
281 if (longhaul_flags & USE_ACPI_C3) { 314 if (longhaul_flags & USE_ACPI_C3) {
282 /* Don't allow wakeup */ 315 /* Don't allow wakeup */
@@ -301,6 +334,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
301 local_irq_restore(flags); 334 local_irq_restore(flags);
302 preempt_enable(); 335 preempt_enable();
303 336
337 freqs.new = calc_speed(longhaul_get_cpu_mult());
304 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 338 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
305} 339}
306 340
@@ -315,31 +349,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
315 349
316#define ROUNDING 0xf 350#define ROUNDING 0xf
317 351
318static int _guess(int guess, int mult)
319{
320 int target;
321
322 target = ((mult/10)*guess);
323 if (mult%10 != 0)
324 target += (guess/2);
325 target += ROUNDING/2;
326 target &= ~ROUNDING;
327 return target;
328}
329
330
331static int guess_fsb(int mult) 352static int guess_fsb(int mult)
332{ 353{
333 int speed = (cpu_khz/1000); 354 int speed = cpu_khz / 1000;
334 int i; 355 int i;
335 int speeds[] = { 66, 100, 133, 200 }; 356 int speeds[] = { 666, 1000, 1333, 2000 };
336 357 int f_max, f_min;
337 speed += ROUNDING/2; 358
338 speed &= ~ROUNDING; 359 for (i = 0; i < 4; i++) {
339 360 f_max = ((speeds[i] * mult) + 50) / 100;
340 for (i=0; i<4; i++) { 361 f_max += (ROUNDING / 2);
341 if (_guess(speeds[i], mult) == speed) 362 f_min = f_max - ROUNDING;
342 return speeds[i]; 363 if ((speed <= f_max) && (speed >= f_min))
364 return speeds[i] / 10;
343 } 365 }
344 return 0; 366 return 0;
345} 367}
@@ -347,67 +369,40 @@ static int guess_fsb(int mult)
347 369
348static int __init longhaul_get_ranges(void) 370static int __init longhaul_get_ranges(void)
349{ 371{
350 unsigned long invalue;
351 unsigned int ezra_t_multipliers[32]= {
352 90, 30, 40, 100, 55, 35, 45, 95,
353 50, 70, 80, 60, 120, 75, 85, 65,
354 -1, 110, 120, -1, 135, 115, 125, 105,
355 130, 150, 160, 140, -1, 155, -1, 145 };
356 unsigned int j, k = 0; 372 unsigned int j, k = 0;
357 union msr_longhaul longhaul; 373 int mult;
358 int mult = 0;
359 374
360 switch (longhaul_version) { 375 /* Get current frequency */
361 case TYPE_LONGHAUL_V1: 376 mult = longhaul_get_cpu_mult();
362 case TYPE_LONGHAUL_V2: 377 if (mult == -1) {
363 /* Ugh, Longhaul v1 didn't have the min/max MSRs. 378 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
364 Assume min=3.0x & max = whatever we booted at. */ 379 return -EINVAL;
380 }
381 fsb = guess_fsb(mult);
382 if (fsb == 0) {
383 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
384 return -EINVAL;
385 }
386 /* Get max multiplier - as we always did.
387 * Longhaul MSR is useful only when voltage scaling is enabled.
388 * C3 is booting at max anyway. */
389 maxmult = mult;
390 /* Get min multiplier */
391 switch (cpu_model) {
392 case CPU_NEHEMIAH:
393 minmult = 50;
394 break;
395 case CPU_NEHEMIAH_C:
396 minmult = 40;
397 break;
398 default:
365 minmult = 30; 399 minmult = 30;
366 maxmult = mult = longhaul_get_cpu_mult();
367 break; 400 break;
368
369 case TYPE_POWERSAVER:
370 /* Ezra-T */
371 if (cpu_model==CPU_EZRA_T) {
372 minmult = 30;
373 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
374 invalue = longhaul.bits.MaxMHzBR;
375 if (longhaul.bits.MaxMHzBR4)
376 invalue += 16;
377 maxmult = mult = ezra_t_multipliers[invalue];
378 break;
379 }
380
381 /* Nehemiah */
382 if (cpu_model==CPU_NEHEMIAH) {
383 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
384
385 /*
386 * TODO: This code works, but raises a lot of questions.
387 * - Some Nehemiah's seem to have broken Min/MaxMHzBR's.
388 * We get around this by using a hardcoded multiplier of 4.0x
389 * for the minimimum speed, and the speed we booted up at for the max.
390 * This is done in longhaul_get_cpu_mult() by reading the EBLCR register.
391 * - According to some VIA documentation EBLCR is only
392 * in pre-Nehemiah C3s. How this still works is a mystery.
393 * We're possibly using something undocumented and unsupported,
394 * But it works, so we don't grumble.
395 */
396 minmult=40;
397 maxmult = mult = longhaul_get_cpu_mult();
398 break;
399 }
400 } 401 }
401 fsb = guess_fsb(mult);
402 402
403 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", 403 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
404 minmult/10, minmult%10, maxmult/10, maxmult%10); 404 minmult/10, minmult%10, maxmult/10, maxmult%10);
405 405
406 if (fsb == 0) {
407 printk (KERN_INFO PFX "Invalid (reserved) FSB!\n");
408 return -EINVAL;
409 }
410
411 highest_speed = calc_speed(maxmult); 406 highest_speed = calc_speed(maxmult);
412 lowest_speed = calc_speed(minmult); 407 lowest_speed = calc_speed(minmult);
413 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, 408 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
@@ -455,6 +450,7 @@ static void __init longhaul_setup_voltagescaling(void)
455 union msr_longhaul longhaul; 450 union msr_longhaul longhaul;
456 struct mV_pos minvid, maxvid; 451 struct mV_pos minvid, maxvid;
457 unsigned int j, speed, pos, kHz_step, numvscales; 452 unsigned int j, speed, pos, kHz_step, numvscales;
453 int min_vid_speed;
458 454
459 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); 455 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
460 if (!(longhaul.bits.RevisionID & 1)) { 456 if (!(longhaul.bits.RevisionID & 1)) {
@@ -468,14 +464,14 @@ static void __init longhaul_setup_voltagescaling(void)
468 mV_vrm_table = &mV_vrm85[0]; 464 mV_vrm_table = &mV_vrm85[0];
469 } else { 465 } else {
470 printk (KERN_INFO PFX "Mobile VRM\n"); 466 printk (KERN_INFO PFX "Mobile VRM\n");
467 if (cpu_model < CPU_NEHEMIAH)
468 return;
471 vrm_mV_table = &mobilevrm_mV[0]; 469 vrm_mV_table = &mobilevrm_mV[0];
472 mV_vrm_table = &mV_mobilevrm[0]; 470 mV_vrm_table = &mV_mobilevrm[0];
473 } 471 }
474 472
475 minvid = vrm_mV_table[longhaul.bits.MinimumVID]; 473 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
476 maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; 474 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
477 numvscales = maxvid.pos - minvid.pos + 1;
478 kHz_step = (highest_speed - lowest_speed) / numvscales;
479 475
480 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { 476 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
481 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " 477 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
@@ -491,20 +487,59 @@ static void __init longhaul_setup_voltagescaling(void)
491 return; 487 return;
492 } 488 }
493 489
494 printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n", 490 /* How many voltage steps */
491 numvscales = maxvid.pos - minvid.pos + 1;
492 printk(KERN_INFO PFX
493 "Max VID=%d.%03d "
494 "Min VID=%d.%03d, "
495 "%d possible voltage scales\n",
495 maxvid.mV/1000, maxvid.mV%1000, 496 maxvid.mV/1000, maxvid.mV%1000,
496 minvid.mV/1000, minvid.mV%1000, 497 minvid.mV/1000, minvid.mV%1000,
497 numvscales); 498 numvscales);
498 499
500 /* Calculate max frequency at min voltage */
501 j = longhaul.bits.MinMHzBR;
502 if (longhaul.bits.MinMHzBR4)
503 j += 16;
504 min_vid_speed = eblcr_table[j];
505 if (min_vid_speed == -1)
506 return;
507 switch (longhaul.bits.MinMHzFSB) {
508 case 0:
509 min_vid_speed *= 13333;
510 break;
511 case 1:
512 min_vid_speed *= 10000;
513 break;
514 case 3:
515 min_vid_speed *= 6666;
516 break;
517 default:
518 return;
519 break;
520 }
521 if (min_vid_speed >= highest_speed)
522 return;
523 /* Calculate kHz for one voltage step */
524 kHz_step = (highest_speed - min_vid_speed) / numvscales;
525
526
499 j = 0; 527 j = 0;
500 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { 528 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
501 speed = longhaul_table[j].frequency; 529 speed = longhaul_table[j].frequency;
502 pos = (speed - lowest_speed) / kHz_step + minvid.pos; 530 if (speed > min_vid_speed)
531 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
532 else
533 pos = minvid.pos;
503 f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; 534 f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos];
535 f_msr_table[longhaul_table[j].index].pos = pos;
504 j++; 536 j++;
505 } 537 }
506 538
539 longhaul_pos = maxvid.pos;
507 can_scale_voltage = 1; 540 can_scale_voltage = 1;
541 printk(KERN_INFO PFX "Voltage scaling enabled. "
542 "Use of \"conservative\" governor is highly recommended.\n");
508} 543}
509 544
510 545
@@ -573,20 +608,51 @@ static int enable_arbiter_disable(void)
573 if (dev != NULL) { 608 if (dev != NULL) {
574 /* Enable access to port 0x22 */ 609 /* Enable access to port 0x22 */
575 pci_read_config_byte(dev, reg, &pci_cmd); 610 pci_read_config_byte(dev, reg, &pci_cmd);
576 if ( !(pci_cmd & 1<<7) ) { 611 if (!(pci_cmd & 1<<7)) {
577 pci_cmd |= 1<<7; 612 pci_cmd |= 1<<7;
578 pci_write_config_byte(dev, reg, pci_cmd); 613 pci_write_config_byte(dev, reg, pci_cmd);
614 pci_read_config_byte(dev, reg, &pci_cmd);
615 if (!(pci_cmd & 1<<7)) {
616 printk(KERN_ERR PFX
617 "Can't enable access to port 0x22.\n");
618 return 0;
619 }
579 } 620 }
580 return 1; 621 return 1;
581 } 622 }
582 return 0; 623 return 0;
583} 624}
584 625
626static int longhaul_setup_vt8235(void)
627{
628 struct pci_dev *dev;
629 u8 pci_cmd;
630
631 /* Find VT8235 southbridge */
632 dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
633 if (dev != NULL) {
634 /* Set transition time to max */
635 pci_read_config_byte(dev, 0xec, &pci_cmd);
636 pci_cmd &= ~(1 << 2);
637 pci_write_config_byte(dev, 0xec, pci_cmd);
638 pci_read_config_byte(dev, 0xe4, &pci_cmd);
639 pci_cmd &= ~(1 << 7);
640 pci_write_config_byte(dev, 0xe4, pci_cmd);
641 pci_read_config_byte(dev, 0xe5, &pci_cmd);
642 pci_cmd |= 1 << 7;
643 pci_write_config_byte(dev, 0xe5, pci_cmd);
644 return 1;
645 }
646 return 0;
647}
648
585static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 649static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
586{ 650{
587 struct cpuinfo_x86 *c = cpu_data; 651 struct cpuinfo_x86 *c = cpu_data;
588 char *cpuname=NULL; 652 char *cpuname=NULL;
589 int ret; 653 int ret;
654 u32 lo, hi;
655 int vt8235_present;
590 656
591 /* Check what we have on this motherboard */ 657 /* Check what we have on this motherboard */
592 switch (c->x86_model) { 658 switch (c->x86_model) {
@@ -599,16 +665,20 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
599 break; 665 break;
600 666
601 case 7: 667 case 7:
602 longhaul_version = TYPE_LONGHAUL_V1;
603 switch (c->x86_mask) { 668 switch (c->x86_mask) {
604 case 0: 669 case 0:
670 longhaul_version = TYPE_LONGHAUL_V1;
605 cpu_model = CPU_SAMUEL2; 671 cpu_model = CPU_SAMUEL2;
606 cpuname = "C3 'Samuel 2' [C5B]"; 672 cpuname = "C3 'Samuel 2' [C5B]";
607 /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */ 673 /* Note, this is not a typo, early Samuel2's had
608 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); 674 * Samuel1 ratios. */
609 memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr)); 675 memcpy(clock_ratio, samuel1_clock_ratio,
676 sizeof(samuel1_clock_ratio));
677 memcpy(eblcr_table, samuel2_eblcr,
678 sizeof(samuel2_eblcr));
610 break; 679 break;
611 case 1 ... 15: 680 case 1 ... 15:
681 longhaul_version = TYPE_LONGHAUL_V2;
612 if (c->x86_mask < 8) { 682 if (c->x86_mask < 8) {
613 cpu_model = CPU_SAMUEL2; 683 cpu_model = CPU_SAMUEL2;
614 cpuname = "C3 'Samuel 2' [C5B]"; 684 cpuname = "C3 'Samuel 2' [C5B]";
@@ -616,8 +686,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
616 cpu_model = CPU_EZRA; 686 cpu_model = CPU_EZRA;
617 cpuname = "C3 'Ezra' [C5C]"; 687 cpuname = "C3 'Ezra' [C5C]";
618 } 688 }
619 memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio)); 689 memcpy(clock_ratio, ezra_clock_ratio,
620 memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr)); 690 sizeof(ezra_clock_ratio));
691 memcpy(eblcr_table, ezra_eblcr,
692 sizeof(ezra_eblcr));
621 break; 693 break;
622 } 694 }
623 break; 695 break;
@@ -632,24 +704,24 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
632 break; 704 break;
633 705
634 case 9: 706 case 9:
635 cpu_model = CPU_NEHEMIAH;
636 longhaul_version = TYPE_POWERSAVER; 707 longhaul_version = TYPE_POWERSAVER;
637 numscales=32; 708 numscales = 32;
709 memcpy(clock_ratio,
710 nehemiah_clock_ratio,
711 sizeof(nehemiah_clock_ratio));
712 memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
638 switch (c->x86_mask) { 713 switch (c->x86_mask) {
639 case 0 ... 1: 714 case 0 ... 1:
640 cpuname = "C3 'Nehemiah A' [C5N]"; 715 cpu_model = CPU_NEHEMIAH;
641 memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio)); 716 cpuname = "C3 'Nehemiah A' [C5XLOE]";
642 memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr));
643 break; 717 break;
644 case 2 ... 4: 718 case 2 ... 4:
645 cpuname = "C3 'Nehemiah B' [C5N]"; 719 cpu_model = CPU_NEHEMIAH;
646 memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio)); 720 cpuname = "C3 'Nehemiah B' [C5XLOH]";
647 memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr));
648 break; 721 break;
649 case 5 ... 15: 722 case 5 ... 15:
650 cpuname = "C3 'Nehemiah C' [C5N]"; 723 cpu_model = CPU_NEHEMIAH_C;
651 memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio)); 724 cpuname = "C3 'Nehemiah C' [C5P]";
652 memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr));
653 break; 725 break;
654 } 726 }
655 break; 727 break;
@@ -658,6 +730,13 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
658 cpuname = "Unknown"; 730 cpuname = "Unknown";
659 break; 731 break;
660 } 732 }
733 /* Check Longhaul ver. 2 */
734 if (longhaul_version == TYPE_LONGHAUL_V2) {
735 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
736 if (lo == 0 && hi == 0)
737 /* Looks like MSR isn't present */
738 longhaul_version = TYPE_LONGHAUL_V1;
739 }
661 740
662 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname); 741 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
663 switch (longhaul_version) { 742 switch (longhaul_version) {
@@ -670,15 +749,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
670 break; 749 break;
671 }; 750 };
672 751
752 /* Doesn't hurt */
753 vt8235_present = longhaul_setup_vt8235();
754
673 /* Find ACPI data for processor */ 755 /* Find ACPI data for processor */
674 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, 756 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
675 &longhaul_walk_callback, NULL, (void *)&pr); 757 ACPI_UINT32_MAX, &longhaul_walk_callback,
758 NULL, (void *)&pr);
676 759
677 /* Check ACPI support for C3 state */ 760 /* Check ACPI support for C3 state */
678 if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) { 761 if (pr != NULL && longhaul_version != TYPE_LONGHAUL_V1) {
679 cx = &pr->power.states[ACPI_STATE_C3]; 762 cx = &pr->power.states[ACPI_STATE_C3];
680 if (cx->address > 0 && 763 if (cx->address > 0 && cx->latency <= 1000) {
681 (cx->latency <= 1000 || ignore_latency != 0) ) {
682 longhaul_flags |= USE_ACPI_C3; 764 longhaul_flags |= USE_ACPI_C3;
683 goto print_support_type; 765 goto print_support_type;
684 } 766 }
@@ -688,8 +770,11 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
688 longhaul_flags |= USE_NORTHBRIDGE; 770 longhaul_flags |= USE_NORTHBRIDGE;
689 goto print_support_type; 771 goto print_support_type;
690 } 772 }
691 773 /* Use VT8235 southbridge if present */
692 /* No ACPI C3 or we can't use it */ 774 if (longhaul_version == TYPE_POWERSAVER && vt8235_present) {
775 longhaul_flags |= USE_VT8235;
776 goto print_support_type;
777 }
693 /* Check ACPI support for bus master arbiter disable */ 778 /* Check ACPI support for bus master arbiter disable */
694 if ((pr == NULL) || !(pr->flags.bm_control)) { 779 if ((pr == NULL) || !(pr->flags.bm_control)) {
695 printk(KERN_ERR PFX 780 printk(KERN_ERR PFX
@@ -698,18 +783,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
698 } 783 }
699 784
700print_support_type: 785print_support_type:
701 if (!(longhaul_flags & USE_NORTHBRIDGE)) { 786 if (longhaul_flags & USE_NORTHBRIDGE)
702 printk (KERN_INFO PFX "Using ACPI support.\n");
703 } else {
704 printk (KERN_INFO PFX "Using northbridge support.\n"); 787 printk (KERN_INFO PFX "Using northbridge support.\n");
705 } 788 else if (longhaul_flags & USE_VT8235)
789 printk (KERN_INFO PFX "Using VT8235 support.\n");
790 else
791 printk (KERN_INFO PFX "Using ACPI support.\n");
706 792
707 ret = longhaul_get_ranges(); 793 ret = longhaul_get_ranges();
708 if (ret != 0) 794 if (ret != 0)
709 return ret; 795 return ret;
710 796
711 if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && 797 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
712 (scale_voltage != 0))
713 longhaul_setup_voltagescaling(); 798 longhaul_setup_voltagescaling();
714 799
715 policy->governor = CPUFREQ_DEFAULT_GOVERNOR; 800 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
@@ -797,8 +882,6 @@ static void __exit longhaul_exit(void)
797 882
798module_param (scale_voltage, int, 0644); 883module_param (scale_voltage, int, 0644);
799MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 884MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
800module_param(ignore_latency, int, 0644);
801MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test");
802 885
803MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 886MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
804MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 887MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
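The rewritten guess_fsb() above replaces the old _guess() search with a bracketing test. Below is a user-space restatement with two worked inputs; the constants and the formula are copied from the hunk (speeds[] in 0.1 MHz units, mult in 0.1x units), while main() and its sample values are purely illustrative.

#include <stdio.h>

#define ROUNDING 0xf

/* Same logic as the new guess_fsb(): pick the candidate FSB whose
 * product with the multiplier brackets the measured core clock.
 * speeds[] is in 0.1 MHz units, mult in 0.1x units, speed_mhz in MHz. */
static int guess_fsb(int speed_mhz, int mult)
{
	static const int speeds[] = { 666, 1000, 1333, 2000 };
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100;
		f_max += ROUNDING / 2;
		f_min = f_max - ROUNDING;
		if (speed_mhz <= f_max && speed_mhz >= f_min)
			return speeds[i] / 10;
	}
	return 0;
}

int main(void)
{
	/* 1000 MHz core at 10.0x -> 100 MHz FSB; 1333 MHz at 10.0x -> 133 MHz */
	printf("%d\n", guess_fsb(1000, 100));
	printf("%d\n", guess_fsb(1333, 100));
	return 0;
}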
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
index bc4682aad69b..bb0a04b1d1ab 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -235,84 +235,14 @@ static int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static int __initdata nehemiah_a_clock_ratio[32] = { 238static int __initdata nehemiah_clock_ratio[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 160, /* 0001 -> 16.0x */ 240 160, /* 0001 -> 16.0x */
241 -1, /* 0010 -> RESERVED */ 241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 -1, /* 0110 -> RESERVED */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 100, /* 0000 -> 10.0x */
256 -1, /* 0001 -> RESERVED */
257 120, /* 0010 -> 12.0x */
258 90, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 120, /* 1111 -> 12.0x */
271};
272
273static int __initdata nehemiah_b_clock_ratio[32] = {
274 100, /* 0000 -> 10.0x */
275 160, /* 0001 -> 16.0x */
276 -1, /* 0010 -> RESERVED */
277 90, /* 0011 -> 9.0x */
278 95, /* 0100 -> 9.5x */
279 -1, /* 0101 -> RESERVED */
280 -1, /* 0110 -> RESERVED */
281 55, /* 0111 -> 5.5x */
282 60, /* 1000 -> 6.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 50, /* 1011 -> 5.0x */
286 65, /* 1100 -> 6.5x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 120, /* 1111 -> 12.0x */
290 100, /* 0000 -> 10.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 90, /* 0011 -> 9.0x */
294 105, /* 0100 -> 10.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 135, /* 0111 -> 13.5x */
298 140, /* 1000 -> 14.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 130, /* 1011 -> 13.0x */
302 145, /* 1100 -> 14.5x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 120, /* 1111 -> 12.0x */
306};
307
308static int __initdata nehemiah_c_clock_ratio[32] = {
309 100, /* 0000 -> 10.0x */
310 160, /* 0001 -> 16.0x */
311 40, /* 0010 -> RESERVED */
312 90, /* 0011 -> 9.0x */ 242 90, /* 0011 -> 9.0x */
313 95, /* 0100 -> 9.5x */ 243 95, /* 0100 -> 9.5x */
314 -1, /* 0101 -> RESERVED */ 244 -1, /* 0101 -> RESERVED */
315 45, /* 0110 -> RESERVED */ 245 45, /* 0110 -> 4.5x */
316 55, /* 0111 -> 5.5x */ 246 55, /* 0111 -> 5.5x */
317 60, /* 1000 -> 6.0x */ 247 60, /* 1000 -> 6.0x */
318 70, /* 1001 -> 7.0x */ 248 70, /* 1001 -> 7.0x */
@@ -340,84 +270,14 @@ static int __initdata nehemiah_c_clock_ratio[32] = {
340 120, /* 1111 -> 12.0x */ 270 120, /* 1111 -> 12.0x */
341}; 271};
342 272
343static int __initdata nehemiah_a_eblcr[32] = { 273static int __initdata nehemiah_eblcr[32] = {
344 50, /* 0000 -> 5.0x */
345 160, /* 0001 -> 16.0x */
346 -1, /* 0010 -> RESERVED */
347 100, /* 0011 -> 10.0x */
348 55, /* 0100 -> 5.5x */
349 -1, /* 0101 -> RESERVED */
350 -1, /* 0110 -> RESERVED */
351 95, /* 0111 -> 9.5x */
352 90, /* 1000 -> 9.0x */
353 70, /* 1001 -> 7.0x */
354 80, /* 1010 -> 8.0x */
355 60, /* 1011 -> 6.0x */
356 120, /* 1100 -> 12.0x */
357 75, /* 1101 -> 7.5x */
358 85, /* 1110 -> 8.5x */
359 65, /* 1111 -> 6.5x */
360 90, /* 0000 -> 9.0x */
361 -1, /* 0001 -> RESERVED */
362 120, /* 0010 -> 12.0x */
363 100, /* 0011 -> 10.0x */
364 135, /* 0100 -> 13.5x */
365 115, /* 0101 -> 11.5x */
366 125, /* 0110 -> 12.5x */
367 105, /* 0111 -> 10.5x */
368 130, /* 1000 -> 13.0x */
369 150, /* 1001 -> 15.0x */
370 160, /* 1010 -> 16.0x */
371 140, /* 1011 -> 14.0x */
372 120, /* 1100 -> 12.0x */
373 155, /* 1101 -> 15.5x */
374 -1, /* 1110 -> RESERVED (13.0x) */
375 145 /* 1111 -> 14.5x */
376 /* end of table */
377};
378static int __initdata nehemiah_b_eblcr[32] = {
379 50, /* 0000 -> 5.0x */
380 160, /* 0001 -> 16.0x */
381 -1, /* 0010 -> RESERVED */
382 100, /* 0011 -> 10.0x */
383 55, /* 0100 -> 5.5x */
384 -1, /* 0101 -> RESERVED */
385 -1, /* 0110 -> RESERVED */
386 95, /* 0111 -> 9.5x */
387 90, /* 1000 -> 9.0x */
388 70, /* 1001 -> 7.0x */
389 80, /* 1010 -> 8.0x */
390 60, /* 1011 -> 6.0x */
391 120, /* 1100 -> 12.0x */
392 75, /* 1101 -> 7.5x */
393 85, /* 1110 -> 8.5x */
394 65, /* 1111 -> 6.5x */
395 90, /* 0000 -> 9.0x */
396 110, /* 0001 -> 11.0x */
397 120, /* 0010 -> 12.0x */
398 100, /* 0011 -> 10.0x */
399 135, /* 0100 -> 13.5x */
400 115, /* 0101 -> 11.5x */
401 125, /* 0110 -> 12.5x */
402 105, /* 0111 -> 10.5x */
403 130, /* 1000 -> 13.0x */
404 150, /* 1001 -> 15.0x */
405 160, /* 1010 -> 16.0x */
406 140, /* 1011 -> 14.0x */
407 120, /* 1100 -> 12.0x */
408 155, /* 1101 -> 15.5x */
409 -1, /* 1110 -> RESERVED (13.0x) */
410 145 /* 1111 -> 14.5x */
411 /* end of table */
412};
413static int __initdata nehemiah_c_eblcr[32] = {
414 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
415 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
416 40, /* 0010 -> RESERVED */ 276 40, /* 0010 -> 4.0x */
417 100, /* 0011 -> 10.0x */ 277 100, /* 0011 -> 10.0x */
418 55, /* 0100 -> 5.5x */ 278 55, /* 0100 -> 5.5x */
419 -1, /* 0101 -> RESERVED */ 279 -1, /* 0101 -> RESERVED */
420 45, /* 0110 -> RESERVED */ 280 45, /* 0110 -> 4.5x */
421 95, /* 0111 -> 9.5x */ 281 95, /* 0111 -> 9.5x */
422 90, /* 1000 -> 9.0x */ 282 90, /* 1000 -> 9.0x */
423 70, /* 1001 -> 7.0x */ 283 70, /* 1001 -> 7.0x */
@@ -443,7 +303,6 @@ static int __initdata nehemiah_c_eblcr[32] = {
443 155, /* 1101 -> 15.5x */ 303 155, /* 1101 -> 15.5x */
444 -1, /* 1110 -> RESERVED (13.0x) */ 304 -1, /* 1110 -> RESERVED (13.0x) */
445 145 /* 1111 -> 14.5x */ 305 145 /* 1111 -> 14.5x */
446 /* end of table */
447}; 306};
448 307
449/* 308/*
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 2d6491672559..fe3b67005ebb 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -1289,7 +1289,11 @@ static unsigned int powernowk8_get (unsigned int cpu)
1289 if (query_current_values_with_pending_wait(data)) 1289 if (query_current_values_with_pending_wait(data))
1290 goto out; 1290 goto out;
1291 1291
1292 khz = find_khz_freq_from_fid(data->currfid); 1292 if (cpu_family == CPU_HW_PSTATE)
1293 khz = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1294 else
1295 khz = find_khz_freq_from_fid(data->currfid);
1296
1293 1297
1294out: 1298out:
1295 set_cpus_allowed(current, oldmask); 1299 set_cpus_allowed(current, oldmask);
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 0b29d41322a2..e1006b7acc9e 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -1,4 +1,5 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h>
2#include <linux/errno.h> 3#include <linux/errno.h>
3#include <linux/hpet.h> 4#include <linux/hpet.h>
4#include <linux/init.h> 5#include <linux/init.h>
@@ -6,17 +7,278 @@
6#include <asm/hpet.h> 7#include <asm/hpet.h>
7#include <asm/io.h> 8#include <asm/io.h>
8 9
10extern struct clock_event_device *global_clock_event;
11
9#define HPET_MASK CLOCKSOURCE_MASK(32) 12#define HPET_MASK CLOCKSOURCE_MASK(32)
10#define HPET_SHIFT 22 13#define HPET_SHIFT 22
11 14
12/* FSEC = 10^-15 NSEC = 10^-9 */ 15/* FSEC = 10^-15 NSEC = 10^-9 */
13#define FSEC_PER_NSEC 1000000 16#define FSEC_PER_NSEC 1000000
14 17
15static void __iomem *hpet_ptr; 18/*
19 * HPET address is set in acpi/boot.c, when an ACPI entry exists
20 */
21unsigned long hpet_address;
22static void __iomem * hpet_virt_address;
23
24static inline unsigned long hpet_readl(unsigned long a)
25{
26 return readl(hpet_virt_address + a);
27}
28
29static inline void hpet_writel(unsigned long d, unsigned long a)
30{
31 writel(d, hpet_virt_address + a);
32}
33
34/*
35 * HPET command line enable / disable
36 */
37static int boot_hpet_disable;
38
39static int __init hpet_setup(char* str)
40{
41 if (str) {
42 if (!strncmp("disable", str, 7))
43 boot_hpet_disable = 1;
44 }
45 return 1;
46}
47__setup("hpet=", hpet_setup);
48
49static inline int is_hpet_capable(void)
50{
51 return (!boot_hpet_disable && hpet_address);
52}
53
54/*
55 * HPET timer interrupt enable / disable
56 */
57static int hpet_legacy_int_enabled;
58
59/**
60 * is_hpet_enabled - check whether the hpet timer interrupt is enabled
61 */
62int is_hpet_enabled(void)
63{
64 return is_hpet_capable() && hpet_legacy_int_enabled;
65}
66
67/*
68 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
69 * timer 0 and timer 1 in case of RTC emulation.
70 */
71#ifdef CONFIG_HPET
72static void hpet_reserve_platform_timers(unsigned long id)
73{
74 struct hpet __iomem *hpet = hpet_virt_address;
75 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
76 unsigned int nrtimers, i;
77 struct hpet_data hd;
78
79 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
80
81 memset(&hd, 0, sizeof (hd));
82 hd.hd_phys_address = hpet_address;
83 hd.hd_address = hpet_virt_address;
84 hd.hd_nirqs = nrtimers;
85 hd.hd_flags = HPET_DATA_PLATFORM;
86 hpet_reserve_timer(&hd, 0);
87
88#ifdef CONFIG_HPET_EMULATE_RTC
89 hpet_reserve_timer(&hd, 1);
90#endif
91
92 hd.hd_irq[0] = HPET_LEGACY_8254;
93 hd.hd_irq[1] = HPET_LEGACY_RTC;
94
95 for (i = 2; i < nrtimers; timer++, i++)
96 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
97 Tn_INT_ROUTE_CNF_SHIFT;
98
99 hpet_alloc(&hd);
100
101}
102#else
103static void hpet_reserve_platform_timers(unsigned long id) { }
104#endif
105
106/*
107 * Common hpet info
108 */
109static unsigned long hpet_period;
110
111static void hpet_set_mode(enum clock_event_mode mode,
112 struct clock_event_device *evt);
113static int hpet_next_event(unsigned long delta,
114 struct clock_event_device *evt);
115
116/*
117 * The hpet clock event device
118 */
119static struct clock_event_device hpet_clockevent = {
120 .name = "hpet",
121 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
122 .set_mode = hpet_set_mode,
123 .set_next_event = hpet_next_event,
124 .shift = 32,
125 .irq = 0,
126};
127
128static void hpet_start_counter(void)
129{
130 unsigned long cfg = hpet_readl(HPET_CFG);
131
132 cfg &= ~HPET_CFG_ENABLE;
133 hpet_writel(cfg, HPET_CFG);
134 hpet_writel(0, HPET_COUNTER);
135 hpet_writel(0, HPET_COUNTER + 4);
136 cfg |= HPET_CFG_ENABLE;
137 hpet_writel(cfg, HPET_CFG);
138}
139
140static void hpet_enable_int(void)
141{
142 unsigned long cfg = hpet_readl(HPET_CFG);
143
144 cfg |= HPET_CFG_LEGACY;
145 hpet_writel(cfg, HPET_CFG);
146 hpet_legacy_int_enabled = 1;
147}
148
149static void hpet_set_mode(enum clock_event_mode mode,
150 struct clock_event_device *evt)
151{
152 unsigned long cfg, cmp, now;
153 uint64_t delta;
154
155 switch(mode) {
156 case CLOCK_EVT_MODE_PERIODIC:
157 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
158 delta >>= hpet_clockevent.shift;
159 now = hpet_readl(HPET_COUNTER);
160 cmp = now + (unsigned long) delta;
161 cfg = hpet_readl(HPET_T0_CFG);
162 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
163 HPET_TN_SETVAL | HPET_TN_32BIT;
164 hpet_writel(cfg, HPET_T0_CFG);
165 /*
166 * The first write after writing TN_SETVAL to the
167 * config register sets the counter value, the second
168 * write sets the period.
169 */
170 hpet_writel(cmp, HPET_T0_CMP);
171 udelay(1);
172 hpet_writel((unsigned long) delta, HPET_T0_CMP);
173 break;
174
175 case CLOCK_EVT_MODE_ONESHOT:
176 cfg = hpet_readl(HPET_T0_CFG);
177 cfg &= ~HPET_TN_PERIODIC;
178 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
179 hpet_writel(cfg, HPET_T0_CFG);
180 break;
181
182 case CLOCK_EVT_MODE_UNUSED:
183 case CLOCK_EVT_MODE_SHUTDOWN:
184 cfg = hpet_readl(HPET_T0_CFG);
185 cfg &= ~HPET_TN_ENABLE;
186 hpet_writel(cfg, HPET_T0_CFG);
187 break;
188 }
189}
190
191static int hpet_next_event(unsigned long delta,
192 struct clock_event_device *evt)
193{
194 unsigned long cnt;
195
196 cnt = hpet_readl(HPET_COUNTER);
197 cnt += delta;
198 hpet_writel(cnt, HPET_T0_CMP);
199
200 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0);
201}
202
203/*
204 * Try to setup the HPET timer
205 */
206int __init hpet_enable(void)
207{
208 unsigned long id;
209 uint64_t hpet_freq;
210
211 if (!is_hpet_capable())
212 return 0;
213
214 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
215
216 /*
217 * Read the period and check for a sane value:
218 */
219 hpet_period = hpet_readl(HPET_PERIOD);
220 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
221 goto out_nohpet;
222
223 /*
224 * The period is a femto seconds value. We need to calculate the
225 * scaled math multiplication factor for nanosecond to hpet tick
226 * conversion.
227 */
228 hpet_freq = 1000000000000000ULL;
229 do_div(hpet_freq, hpet_period);
230 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
231 NSEC_PER_SEC, 32);
232 /* Calculate the min / max delta */
233 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
234 &hpet_clockevent);
235 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
236 &hpet_clockevent);
237
238 /*
239 * Read the HPET ID register to retrieve the IRQ routing
240 * information and the number of channels
241 */
242 id = hpet_readl(HPET_ID);
243
244#ifdef CONFIG_HPET_EMULATE_RTC
245 /*
246 * The legacy routing mode needs at least two channels, tick timer
247 * and the rtc emulation channel.
248 */
249 if (!(id & HPET_ID_NUMBER))
250 goto out_nohpet;
251#endif
252
253 /* Start the counter */
254 hpet_start_counter();
255
256 if (id & HPET_ID_LEGSUP) {
257 hpet_enable_int();
258 hpet_reserve_platform_timers(id);
259 /*
260 * Start hpet with the boot cpu mask and make it
261 * global after the IO_APIC has been initialized.
262 */
263	hpet_clockevent.cpumask = cpumask_of_cpu(0);
264 clockevents_register_device(&hpet_clockevent);
265 global_clock_event = &hpet_clockevent;
266 return 1;
267 }
268 return 0;
16 269
270out_nohpet:
271 iounmap(hpet_virt_address);
272 hpet_virt_address = NULL;
273 return 0;
274}
275
276/*
277 * Clock source related code
278 */
17static cycle_t read_hpet(void) 279static cycle_t read_hpet(void)
18{ 280{
19 return (cycle_t)readl(hpet_ptr); 281 return (cycle_t)hpet_readl(HPET_COUNTER);
20} 282}
21 283
22static struct clocksource clocksource_hpet = { 284static struct clocksource clocksource_hpet = {
@@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = {
24 .rating = 250, 286 .rating = 250,
25 .read = read_hpet, 287 .read = read_hpet,
26 .mask = HPET_MASK, 288 .mask = HPET_MASK,
27 .mult = 0, /* set below */
28 .shift = HPET_SHIFT, 289 .shift = HPET_SHIFT,
29 .is_continuous = 1, 290 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
30}; 291};
31 292
32static int __init init_hpet_clocksource(void) 293static int __init init_hpet_clocksource(void)
33{ 294{
34 unsigned long hpet_period;
35 void __iomem* hpet_base;
36 u64 tmp; 295 u64 tmp;
37 int err;
38 296
39 if (!is_hpet_enabled()) 297 if (!hpet_virt_address)
40 return -ENODEV; 298 return -ENODEV;
41 299
42 /* calculate the hpet address: */
43 hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
44 hpet_ptr = hpet_base + HPET_COUNTER;
45
46 /* calculate the frequency: */
47 hpet_period = readl(hpet_base + HPET_PERIOD);
48
49 /* 300 /*
50 * hpet period is in femto seconds per cycle 301 * hpet period is in femto seconds per cycle
51 * so we need to convert this to ns/cyc units 302 * so we need to convert this to ns/cyc units
@@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 312 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 313 clocksource_hpet.mult = (u32)tmp;
63 314
64 err = clocksource_register(&clocksource_hpet); 315 return clocksource_register(&clocksource_hpet);
65 if (err)
66 iounmap(hpet_base);
67
68 return err;
69} 316}
70 317
71module_init(init_hpet_clocksource); 318module_init(init_hpet_clocksource);
319
320#ifdef CONFIG_HPET_EMULATE_RTC
321
322/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When HPET
323 * is enabled, we support RTC interrupt functionality in software.
324 * RTC has 3 kinds of interrupts:
325 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
326 * is updated
327 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
328 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
329 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
330 * (1) and (2) above are implemented using polling at a frequency of
331 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
332 * overhead. (DEFAULT_RTC_INT_FREQ)
333 * For (3), we use interrupts at 64Hz or user specified periodic
334 * frequency, whichever is higher.
335 */
336#include <linux/mc146818rtc.h>
337#include <linux/rtc.h>
338
339#define DEFAULT_RTC_INT_FREQ 64
340#define DEFAULT_RTC_SHIFT 6
341#define RTC_NUM_INTS 1
342
343static unsigned long hpet_rtc_flags;
344static unsigned long hpet_prev_update_sec;
345static struct rtc_time hpet_alarm_time;
346static unsigned long hpet_pie_count;
347static unsigned long hpet_t1_cmp;
348static unsigned long hpet_default_delta;
349static unsigned long hpet_pie_delta;
350static unsigned long hpet_pie_limit;
351
352/*
353 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
354 * is not supported by all HPET implementations for timer 1.
355 *
356 * hpet_rtc_timer_init() is called when the rtc is initialized.
357 */
358int hpet_rtc_timer_init(void)
359{
360 unsigned long cfg, cnt, delta, flags;
361
362 if (!is_hpet_enabled())
363 return 0;
364
365 if (!hpet_default_delta) {
366 uint64_t clc;
367
368 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
369 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
370 hpet_default_delta = (unsigned long) clc;
371 }
372
373 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
374 delta = hpet_default_delta;
375 else
376 delta = hpet_pie_delta;
377
378 local_irq_save(flags);
379
380 cnt = delta + hpet_readl(HPET_COUNTER);
381 hpet_writel(cnt, HPET_T1_CMP);
382 hpet_t1_cmp = cnt;
383
384 cfg = hpet_readl(HPET_T1_CFG);
385 cfg &= ~HPET_TN_PERIODIC;
386 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
387 hpet_writel(cfg, HPET_T1_CFG);
388
389 local_irq_restore(flags);
390
391 return 1;
392}
393
394/*
395 * The functions below are called from rtc driver.
396 * Return 0 if HPET is not being used.
397 * Otherwise do the necessary changes and return 1.
398 */
399int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
400{
401 if (!is_hpet_enabled())
402 return 0;
403
404 hpet_rtc_flags &= ~bit_mask;
405 return 1;
406}
407
408int hpet_set_rtc_irq_bit(unsigned long bit_mask)
409{
410 unsigned long oldbits = hpet_rtc_flags;
411
412 if (!is_hpet_enabled())
413 return 0;
414
415 hpet_rtc_flags |= bit_mask;
416
417 if (!oldbits)
418 hpet_rtc_timer_init();
419
420 return 1;
421}
422
423int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
424 unsigned char sec)
425{
426 if (!is_hpet_enabled())
427 return 0;
428
429 hpet_alarm_time.tm_hour = hrs;
430 hpet_alarm_time.tm_min = min;
431 hpet_alarm_time.tm_sec = sec;
432
433 return 1;
434}
435
436int hpet_set_periodic_freq(unsigned long freq)
437{
438 uint64_t clc;
439
440 if (!is_hpet_enabled())
441 return 0;
442
443 if (freq <= DEFAULT_RTC_INT_FREQ)
444 hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
445 else {
446 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
447 do_div(clc, freq);
448 clc >>= hpet_clockevent.shift;
449 hpet_pie_delta = (unsigned long) clc;
450 }
451 return 1;
452}
453
454int hpet_rtc_dropped_irq(void)
455{
456 return is_hpet_enabled();
457}
458
459static void hpet_rtc_timer_reinit(void)
460{
461 unsigned long cfg, delta;
462 int lost_ints = -1;
463
464 if (unlikely(!hpet_rtc_flags)) {
465 cfg = hpet_readl(HPET_T1_CFG);
466 cfg &= ~HPET_TN_ENABLE;
467 hpet_writel(cfg, HPET_T1_CFG);
468 return;
469 }
470
471 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
472 delta = hpet_default_delta;
473 else
474 delta = hpet_pie_delta;
475
476 /*
477 * Increment the comparator value until we are ahead of the
478 * current count.
479 */
480 do {
481 hpet_t1_cmp += delta;
482 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
483 lost_ints++;
484 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
485
486 if (lost_ints) {
487 if (hpet_rtc_flags & RTC_PIE)
488 hpet_pie_count += lost_ints;
489 if (printk_ratelimit())
490 printk(KERN_WARNING "rtc: lost %d interrupts\n",
491 lost_ints);
492 }
493}
494
495irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
496{
497 struct rtc_time curr_time;
498 unsigned long rtc_int_flag = 0;
499
500 hpet_rtc_timer_reinit();
501
502 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
503 rtc_get_rtc_time(&curr_time);
504
505 if (hpet_rtc_flags & RTC_UIE &&
506 curr_time.tm_sec != hpet_prev_update_sec) {
507 rtc_int_flag = RTC_UF;
508 hpet_prev_update_sec = curr_time.tm_sec;
509 }
510
511 if (hpet_rtc_flags & RTC_PIE &&
512 ++hpet_pie_count >= hpet_pie_limit) {
513 rtc_int_flag |= RTC_PF;
514 hpet_pie_count = 0;
515 }
516
517 if (hpet_rtc_flags & RTC_PIE &&
518 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
519 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
520 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
521 rtc_int_flag |= RTC_AF;
522
523 if (rtc_int_flag) {
524 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
525 rtc_interrupt(rtc_int_flag, dev_id);
526 }
527 return IRQ_HANDLED;
528}
529#endif
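The scaled math in hpet_enable() above (div_sc() for the mult factor, clockevent_delta2ns() for the min/max deltas, and the periodic comparator delta computed in hpet_set_mode()) reduces to a pair of fixed-point conversions. The sketch below reruns that arithmetic in user space for one worked case; the 69841279 fs period (a typical 14.318 MHz HPET) and HZ = 250 are assumptions chosen only to get concrete numbers, and the helpers skip the rounding and clamping the kernel versions also perform.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* mult = (ticks_per_sec << shift) / NSEC_PER_SEC, as div_sc() computes */
static uint32_t div_sc(uint64_t ticks, uint64_t nsec, int shift)
{
	return (uint32_t)((ticks << shift) / nsec);
}

/* ticks -> nanoseconds, the core of clockevent_delta2ns() */
static uint64_t delta2ns(uint64_t ticks, uint32_t mult, int shift)
{
	return (ticks << shift) / mult;
}

int main(void)
{
	uint64_t period_fs = 69841279;			   /* HPET_PERIOD, example */
	uint64_t freq = 1000000000000000ULL / period_fs;   /* ~14.318 MHz */
	uint32_t mult = div_sc(freq, NSEC_PER_SEC, 32);
	uint64_t tick_ns = NSEC_PER_SEC / 250;		   /* HZ = 250 assumed */
	uint64_t cmp_delta = (tick_ns * mult) >> 32;	   /* periodic mode delta */

	printf("mult=%u  periodic CMP delta=%llu ticks  max_delta=%llu ns\n",
	       mult, (unsigned long long)cmp_delta,
	       (unsigned long long)delta2ns(0x7FFFFFFF, mult, 32));
	return 0;
}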
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 9a0060b92e32..a6bc7bb38834 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -2,7 +2,7 @@
2 * i8253.c 8253/PIT functions 2 * i8253.c 8253/PIT functions
3 * 3 *
4 */ 4 */
5#include <linux/clocksource.h> 5#include <linux/clockchips.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/jiffies.h> 7#include <linux/jiffies.h>
8#include <linux/sysdev.h> 8#include <linux/sysdev.h>
@@ -19,17 +19,97 @@
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22void setup_pit_timer(void) 22/*
23 * HPET replaces the PIT, when enabled. So we need to know, which of
24 * the two timers is used
25 */
26struct clock_event_device *global_clock_event;
27
28/*
29 * Initialize the PIT timer.
30 *
31 * This is also called after resume to bring the PIT into operation again.
32 */
33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt)
35{
36 unsigned long flags;
37
38 spin_lock_irqsave(&i8253_lock, flags);
39
40 switch(mode) {
41 case CLOCK_EVT_MODE_PERIODIC:
42 /* binary, mode 2, LSB/MSB, ch 0 */
43 outb_p(0x34, PIT_MODE);
44 udelay(10);
45 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
46 udelay(10);
47 outb(LATCH >> 8 , PIT_CH0); /* MSB */
48 break;
49
50 case CLOCK_EVT_MODE_ONESHOT:
51 case CLOCK_EVT_MODE_SHUTDOWN:
52 case CLOCK_EVT_MODE_UNUSED:
53 /* One shot setup */
54 outb_p(0x38, PIT_MODE);
55 udelay(10);
56 break;
57 }
58 spin_unlock_irqrestore(&i8253_lock, flags);
59}
60
61/*
62 * Program the next event in oneshot mode
63 *
64 * Delta is given in PIT ticks
65 */
66static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
23{ 67{
24 unsigned long flags; 68 unsigned long flags;
25 69
26 spin_lock_irqsave(&i8253_lock, flags); 70 spin_lock_irqsave(&i8253_lock, flags);
27 outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 71 outb_p(delta & 0xff , PIT_CH0); /* LSB */
28 udelay(10); 72 outb(delta >> 8 , PIT_CH0); /* MSB */
29 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
30 udelay(10);
31 outb(LATCH >> 8 , PIT_CH0); /* MSB */
32 spin_unlock_irqrestore(&i8253_lock, flags); 73 spin_unlock_irqrestore(&i8253_lock, flags);
74
75 return 0;
76}
77
78/*
79 * On UP the PIT can serve all of the possible timer functions. On SMP systems
80 * it can be solely used for the global tick.
81 *
82 * The profiling and update capabilities are switched off once the local apic is
83 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
84 * !using_apic_timer decisions in do_timer_interrupt_hook()
85 */
86struct clock_event_device pit_clockevent = {
87 .name = "pit",
88 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
89 .set_mode = init_pit_timer,
90 .set_next_event = pit_next_event,
91 .shift = 32,
92 .irq = 0,
93};
94
95/*
96 * Initialize the conversion factor and the min/max deltas of the clock event
97 * structure and register the clock event source with the framework.
98 */
99void __init setup_pit_timer(void)
100{
101 /*
102 * Start pit with the boot cpu mask and make it global after the
103 * IO_APIC has been initialized.
104 */
105 pit_clockevent.cpumask = cpumask_of_cpu(0);
106 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
107 pit_clockevent.max_delta_ns =
108 clockevent_delta2ns(0x7FFF, &pit_clockevent);
109 pit_clockevent.min_delta_ns =
110 clockevent_delta2ns(0xF, &pit_clockevent);
111 clockevents_register_device(&pit_clockevent);
112 global_clock_event = &pit_clockevent;
33} 113}
34 114
35/* 115/*
@@ -46,7 +126,7 @@ static cycle_t pit_read(void)
46 static u32 old_jifs; 126 static u32 old_jifs;
47 127
48 spin_lock_irqsave(&i8253_lock, flags); 128 spin_lock_irqsave(&i8253_lock, flags);
49 /* 129 /*
50 * Although our caller may have the read side of xtime_lock, 130 * Although our caller may have the read side of xtime_lock,
51 * this is now a seqlock, and we are cheating in this routine 131 * this is now a seqlock, and we are cheating in this routine
52 * by having side effects on state that we cannot undo if 132 * by having side effects on state that we cannot undo if
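The PIT clockevent registered by setup_pit_timer() above uses the same conversions with the 8254's fixed input clock. This is a hedged user-space restatement: CLOCK_TICK_RATE is the standard 1.193182 MHz PIT clock, and HZ = 250 plus the LATCH formula are taken from the usual kernel definitions rather than from this hunk.

#include <stdio.h>

#define CLOCK_TICK_RATE	1193182			/* PIT input clock, Hz */
#define HZ		250			/* assumed for the example */
#define LATCH		((CLOCK_TICK_RATE + HZ / 2) / HZ)

int main(void)
{
	unsigned long long mult = ((unsigned long long)CLOCK_TICK_RATE << 32)
				  / 1000000000ULL;	/* div_sc(..., NSEC_PER_SEC, 32) */
	unsigned long long max_ns = (0x7FFFULL << 32) / mult;
	unsigned long long min_ns = (0xFULL << 32) / mult;

	/* Periodic mode keeps reloading LATCH ticks; oneshot mode reprograms
	 * the 16-bit counter per event, hence the 0x7FFF / 0xF bounds. */
	printf("LATCH=%d ticks  mult=%llu\n", LATCH, mult);
	printf("oneshot range: %llu ns .. %llu ns\n", min_ns, max_ns);
	return 0;
}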
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index c8d45821c788..03abfdb1a6e4 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int);
41static struct irq_chip i8259A_chip = { 41static struct irq_chip i8259A_chip = {
42 .name = "XT-PIC", 42 .name = "XT-PIC",
43 .mask = disable_8259A_irq, 43 .mask = disable_8259A_irq,
44 .disable = disable_8259A_irq,
44 .unmask = enable_8259A_irq, 45 .unmask = enable_8259A_irq,
45 .mask_ack = mask_and_ack_8259A, 46 .mask_ack = mask_and_ack_8259A,
46}; 47};
@@ -410,12 +411,6 @@ void __init native_init_IRQ(void)
410 intr_init_hook(); 411 intr_init_hook();
411 412
412 /* 413 /*
413 * Set the clock to HZ Hz, we already have a valid
414 * vector now:
415 */
416 setup_pit_timer();
417
418 /*
419 * External FPU? Set up irq13 if so, for 414 * External FPU? Set up irq13 if so, for
420 * original braindamaged IBM FERR coupling. 415 * original braindamaged IBM FERR coupling.
421 */ 416 */
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index e30ccedad0b9..4ccebd454e25 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -482,8 +482,8 @@ static void do_irq_balance(void)
482 package_index = CPU_TO_PACKAGEINDEX(i); 482 package_index = CPU_TO_PACKAGEINDEX(i);
483 for (j = 0; j < NR_IRQS; j++) { 483 for (j = 0; j < NR_IRQS; j++) {
484 unsigned long value_now, delta; 484 unsigned long value_now, delta;
485 /* Is this an active IRQ? */ 485 /* Is this an active IRQ or balancing disabled ? */
486 if (!irq_desc[j].action) 486 if (!irq_desc[j].action || irq_balancing_disabled(j))
487 continue; 487 continue;
488 if ( package_index == i ) 488 if ( package_index == i )
489 IRQ_DELTA(package_index,j) = 0; 489 IRQ_DELTA(package_index,j) = 0;
@@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1281 trigger == IOAPIC_LEVEL) 1281 trigger == IOAPIC_LEVEL)
1282 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1282 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1283 handle_fasteoi_irq, "fasteoi"); 1283 handle_fasteoi_irq, "fasteoi");
1284 else { 1284 else
1285 irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1286 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1285 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1287 handle_edge_irq, "edge"); 1286 handle_edge_irq, "edge");
1288 }
1289 set_intr_gate(vector, interrupt[irq]); 1287 set_intr_gate(vector, interrupt[irq]);
1290} 1288}
1291 1289
@@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy)
1588 v = apic_read(APIC_LVR); 1586 v = apic_read(APIC_LVR);
1589 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1587 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1590 ver = GET_APIC_VERSION(v); 1588 ver = GET_APIC_VERSION(v);
1591 maxlvt = get_maxlvt(); 1589 maxlvt = lapic_get_maxlvt();
1592 1590
1593 v = apic_read(APIC_TASKPRI); 1591 v = apic_read(APIC_TASKPRI);
1594 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1592 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 5785d84103a6..0f2ca590bf23 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -10,7 +10,6 @@
10 * io_apic.c.) 10 * io_apic.c.)
11 */ 11 */
12 12
13#include <asm/uaccess.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include <linux/interrupt.h> 15#include <linux/interrupt.h>
@@ -21,19 +20,34 @@
21 20
22#include <asm/idle.h> 21#include <asm/idle.h>
23 22
23#include <asm/apic.h>
24#include <asm/uaccess.h>
25
24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 26DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
25EXPORT_PER_CPU_SYMBOL(irq_stat); 27EXPORT_PER_CPU_SYMBOL(irq_stat);
26 28
27#ifndef CONFIG_X86_LOCAL_APIC
28/* 29/*
29 * 'what should we do if we get a hw irq event on an illegal vector'. 30 * 'what should we do if we get a hw irq event on an illegal vector'.
30 * each architecture has to answer this themselves. 31 * each architecture has to answer this themselves.
31 */ 32 */
32void ack_bad_irq(unsigned int irq) 33void ack_bad_irq(unsigned int irq)
33{ 34{
34 printk("unexpected IRQ trap at vector %02x\n", irq); 35 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
35} 36
37#ifdef CONFIG_X86_LOCAL_APIC
38 /*
39 * Currently unexpected vectors happen only on SMP and APIC.
40 * We _must_ ack these because every local APIC has only N
41 * irq slots per priority level, and a 'hanging, unacked' IRQ
42 * holds up an irq slot - in excessive cases (when multiple
43 * unexpected vectors occur) that might lock up the APIC
44 * completely.
45 * But only ack when the APIC is enabled -AK
46 */
47 if (cpu_has_apic)
48 ack_APIC_irq();
36#endif 49#endif
50}
37 51
38#ifdef CONFIG_4KSTACKS 52#ifdef CONFIG_4KSTACKS
39/* 53/*
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 5d8a07c20281..821df34d2b3a 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -23,6 +23,7 @@
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/kernel_stat.h>
26 27
27#include <asm/smp.h> 28#include <asm/smp.h>
28#include <asm/nmi.h> 29#include <asm/nmi.h>
@@ -973,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
973 cpu_clear(cpu, backtrace_mask); 974 cpu_clear(cpu, backtrace_mask);
974 } 975 }
975 976
976 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 977 /*
978 * Take the local apic timer and PIT/HPET into account. We don't
979 * know which one is active, when we have highres/dyntick on
980 */
981 sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
977 982
 978 /* if the apic timer isn't firing, this cpu isn't doing much */ 983 /* if none of the timers is firing, this cpu isn't doing much */
979 if (!touched && last_irq_sums[cpu] == sum) { 984 if (!touched && last_irq_sums[cpu] == sum) {
980 /* 985 /*
981 * Ayiee, looks like this CPU is stuck ... 986 * Ayiee, looks like this CPU is stuck ...
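The hunk above widens the NMI watchdog's notion of progress: it now sums the local APIC timer interrupts and kstat_irqs(0) (the PIT/HPET interrupt), because with highres/dyntick either source may be driving the tick. A standalone sketch of that stuck-CPU heuristic follows; the counters, the triggering sequence and the printout are invented for illustration, and only the "no change since the last check" comparison mirrors the patched code.

    /* Illustrative only: mimics the "no timer interrupts seen since the
     * last check" test from nmi_watchdog_tick(), using fake counters. */
    #include <stdio.h>

    static unsigned int apic_timer_irqs;   /* stand-in for per-cpu apic_timer_irqs */
    static unsigned int irq0_count;        /* stand-in for kstat_irqs(0): PIT/HPET */
    static unsigned int last_sum;
    static unsigned int alert;

    static void watchdog_tick(void)
    {
            unsigned int sum = apic_timer_irqs + irq0_count;

            if (sum == last_sum) {
                    alert++;
                    printf("no timer ticks seen, alert=%u\n", alert);
            } else {
                    alert = 0;
                    last_sum = sum;
            }
    }

    int main(void)
    {
            watchdog_tick();          /* nothing has fired yet */
            irq0_count += 4;          /* PIT/HPET fired while the APIC timer is off */
            watchdog_tick();          /* progress detected, alert resets */
            watchdog_tick();          /* no progress again */
            return 0;
    }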
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 7845d480c293..bea304d48cdb 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -38,6 +38,7 @@
38#include <linux/ptrace.h> 38#include <linux/ptrace.h>
39#include <linux/random.h> 39#include <linux/random.h>
40#include <linux/personality.h> 40#include <linux/personality.h>
41#include <linux/tick.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -211,6 +212,7 @@ void cpu_idle(void)
211 212
212 /* endless idle loop with no priority at all */ 213 /* endless idle loop with no priority at all */
213 while (1) { 214 while (1) {
215 tick_nohz_stop_sched_tick();
214 while (!need_resched()) { 216 while (!need_resched()) {
215 void (*idle)(void); 217 void (*idle)(void);
216 218
@@ -238,6 +240,7 @@ void cpu_idle(void)
238 idle(); 240 idle();
239 __exit_idle(); 241 __exit_idle();
240 } 242 }
243 tick_nohz_restart_sched_tick();
241 preempt_enable_no_resched(); 244 preempt_enable_no_resched();
242 schedule(); 245 schedule();
243 preempt_disable(); 246 preempt_disable();
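The cpu_idle() change brackets the inner idle loop with tick_nohz_stop_sched_tick() and tick_nohz_restart_sched_tick(), so the periodic tick can be switched off while the CPU sleeps and re-armed before the scheduler runs again. The following is only a shape sketch of that pattern, with stub functions and a bounded loop standing in for the real tick and scheduler code:

    /* Shape-only sketch of the NO_HZ idle pattern; all functions are stubs. */
    #include <stdio.h>
    #include <stdbool.h>

    static int wakeups;

    static void tick_stop(void)      { printf("tick stopped\n"); }
    static void tick_restart(void)   { printf("tick restarted\n"); }
    static bool need_resched(void)   { return wakeups > 0; }
    static void do_idle(void)        { printf("halt until the next interrupt\n"); wakeups = 1; }
    static void schedule_once(void)  { printf("run the woken task\n"); wakeups = 0; }

    int main(void)
    {
            for (int round = 0; round < 3; round++) {   /* endless loop in the kernel */
                    tick_stop();                        /* tick_nohz_stop_sched_tick() */
                    while (!need_resched())
                            do_idle();
                    tick_restart();                     /* tick_nohz_restart_sched_tick() */
                    schedule_once();
            }
            return 0;
    }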
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index f46a4d095e6c..48bfcaa13ecc 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -94,12 +94,6 @@ cpumask_t cpu_possible_map;
94EXPORT_SYMBOL(cpu_possible_map); 94EXPORT_SYMBOL(cpu_possible_map);
95static cpumask_t smp_commenced_mask; 95static cpumask_t smp_commenced_mask;
96 96
97/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
98 * is no way to resync one AP against BP. TBD: for prescott and above, we
99 * should use IA64's algorithm
100 */
101static int __devinitdata tsc_sync_disabled;
102
103/* Per CPU bogomips and other parameters */ 97/* Per CPU bogomips and other parameters */
104struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 98struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
105EXPORT_SYMBOL(cpu_data); 99EXPORT_SYMBOL(cpu_data);
@@ -216,151 +210,6 @@ valid_k7:
216 ; 210 ;
217} 211}
218 212
219/*
220 * TSC synchronization.
221 *
222 * We first check whether all CPUs have their TSC's synchronized,
223 * then we print a warning if not, and always resync.
224 */
225
226static struct {
227 atomic_t start_flag;
228 atomic_t count_start;
229 atomic_t count_stop;
230 unsigned long long values[NR_CPUS];
231} tsc __cpuinitdata = {
232 .start_flag = ATOMIC_INIT(0),
233 .count_start = ATOMIC_INIT(0),
234 .count_stop = ATOMIC_INIT(0),
235};
236
237#define NR_LOOPS 5
238
239static void __init synchronize_tsc_bp(void)
240{
241 int i;
242 unsigned long long t0;
243 unsigned long long sum, avg;
244 long long delta;
245 unsigned int one_usec;
246 int buggy = 0;
247
248 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
249
250 /* convert from kcyc/sec to cyc/usec */
251 one_usec = cpu_khz / 1000;
252
253 atomic_set(&tsc.start_flag, 1);
254 wmb();
255
256 /*
257 * We loop a few times to get a primed instruction cache,
258 * then the last pass is more or less synchronized and
259 * the BP and APs set their cycle counters to zero all at
260 * once. This reduces the chance of having random offsets
261 * between the processors, and guarantees that the maximum
262 * delay between the cycle counters is never bigger than
263 * the latency of information-passing (cachelines) between
264 * two CPUs.
265 */
266 for (i = 0; i < NR_LOOPS; i++) {
267 /*
268 * all APs synchronize but they loop on '== num_cpus'
269 */
270 while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
271 cpu_relax();
272 atomic_set(&tsc.count_stop, 0);
273 wmb();
274 /*
275 * this lets the APs save their current TSC:
276 */
277 atomic_inc(&tsc.count_start);
278
279 rdtscll(tsc.values[smp_processor_id()]);
280 /*
281 * We clear the TSC in the last loop:
282 */
283 if (i == NR_LOOPS-1)
284 write_tsc(0, 0);
285
286 /*
287 * Wait for all APs to leave the synchronization point:
288 */
289 while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
290 cpu_relax();
291 atomic_set(&tsc.count_start, 0);
292 wmb();
293 atomic_inc(&tsc.count_stop);
294 }
295
296 sum = 0;
297 for (i = 0; i < NR_CPUS; i++) {
298 if (cpu_isset(i, cpu_callout_map)) {
299 t0 = tsc.values[i];
300 sum += t0;
301 }
302 }
303 avg = sum;
304 do_div(avg, num_booting_cpus());
305
306 for (i = 0; i < NR_CPUS; i++) {
307 if (!cpu_isset(i, cpu_callout_map))
308 continue;
309 delta = tsc.values[i] - avg;
310 if (delta < 0)
311 delta = -delta;
312 /*
313 * We report bigger than 2 microseconds clock differences.
314 */
315 if (delta > 2*one_usec) {
316 long long realdelta;
317
318 if (!buggy) {
319 buggy = 1;
320 printk("\n");
321 }
322 realdelta = delta;
323 do_div(realdelta, one_usec);
324 if (tsc.values[i] < avg)
325 realdelta = -realdelta;
326
327 if (realdelta)
328 printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
329 "skew, fixed it up.\n", i, realdelta);
330 }
331 }
332 if (!buggy)
333 printk("passed.\n");
334}
335
336static void __cpuinit synchronize_tsc_ap(void)
337{
338 int i;
339
340 /*
341 * Not every cpu is online at the time
342 * this gets called, so we first wait for the BP to
343 * finish SMP initialization:
344 */
345 while (!atomic_read(&tsc.start_flag))
346 cpu_relax();
347
348 for (i = 0; i < NR_LOOPS; i++) {
349 atomic_inc(&tsc.count_start);
350 while (atomic_read(&tsc.count_start) != num_booting_cpus())
351 cpu_relax();
352
353 rdtscll(tsc.values[smp_processor_id()]);
354 if (i == NR_LOOPS-1)
355 write_tsc(0, 0);
356
357 atomic_inc(&tsc.count_stop);
358 while (atomic_read(&tsc.count_stop) != num_booting_cpus())
359 cpu_relax();
360 }
361}
362#undef NR_LOOPS
363
364extern void calibrate_delay(void); 213extern void calibrate_delay(void);
365 214
366static atomic_t init_deasserted; 215static atomic_t init_deasserted;
@@ -438,20 +287,12 @@ static void __cpuinit smp_callin(void)
438 /* 287 /*
439 * Save our processor parameters 288 * Save our processor parameters
440 */ 289 */
441 smp_store_cpu_info(cpuid); 290 smp_store_cpu_info(cpuid);
442
443 disable_APIC_timer();
444 291
445 /* 292 /*
446 * Allow the master to continue. 293 * Allow the master to continue.
447 */ 294 */
448 cpu_set(cpuid, cpu_callin_map); 295 cpu_set(cpuid, cpu_callin_map);
449
450 /*
451 * Synchronize the TSC with the BP
452 */
453 if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
454 synchronize_tsc_ap();
455} 296}
456 297
457static int cpucount; 298static int cpucount;
@@ -554,13 +395,17 @@ static void __cpuinit start_secondary(void *unused)
554 smp_callin(); 395 smp_callin();
555 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 396 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
556 rep_nop(); 397 rep_nop();
398 /*
399 * Check TSC synchronization with the BP:
400 */
401 check_tsc_sync_target();
402
557 setup_secondary_clock(); 403 setup_secondary_clock();
558 if (nmi_watchdog == NMI_IO_APIC) { 404 if (nmi_watchdog == NMI_IO_APIC) {
559 disable_8259A_irq(0); 405 disable_8259A_irq(0);
560 enable_NMI_through_LVT0(NULL); 406 enable_NMI_through_LVT0(NULL);
561 enable_8259A_irq(0); 407 enable_8259A_irq(0);
562 } 408 }
563 enable_APIC_timer();
564 /* 409 /*
565 * low-memory mappings have been cleared, flush them from 410 * low-memory mappings have been cleared, flush them from
566 * the local TLBs too. 411 * the local TLBs too.
@@ -752,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
752 /* 597 /*
753 * Due to the Pentium erratum 3AP. 598 * Due to the Pentium erratum 3AP.
754 */ 599 */
755 maxlvt = get_maxlvt(); 600 maxlvt = lapic_get_maxlvt();
756 if (maxlvt > 3) { 601 if (maxlvt > 3) {
757 apic_read_around(APIC_SPIV); 602 apic_read_around(APIC_SPIV);
758 apic_write(APIC_ESR, 0); 603 apic_write(APIC_ESR, 0);
@@ -849,7 +694,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
849 */ 694 */
850 Dprintk("#startup loops: %d.\n", num_starts); 695 Dprintk("#startup loops: %d.\n", num_starts);
851 696
852 maxlvt = get_maxlvt(); 697 maxlvt = lapic_get_maxlvt();
853 698
854 for (j = 1; j <= num_starts; j++) { 699 for (j = 1; j <= num_starts; j++) {
855 Dprintk("Sending STARTUP #%d.\n",j); 700 Dprintk("Sending STARTUP #%d.\n",j);
@@ -1125,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1125 info.cpu = cpu; 970 info.cpu = cpu;
1126 INIT_WORK(&info.task, do_warm_boot_cpu); 971 INIT_WORK(&info.task, do_warm_boot_cpu);
1127 972
1128 tsc_sync_disabled = 1;
1129
1130 /* init low mem mapping */ 973 /* init low mem mapping */
1131 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 974 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1132 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 975 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1134,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1134 schedule_work(&info.task); 977 schedule_work(&info.task);
1135 wait_for_completion(&done); 978 wait_for_completion(&done);
1136 979
1137 tsc_sync_disabled = 0;
1138 zap_low_mappings(); 980 zap_low_mappings();
1139 ret = 0; 981 ret = 0;
1140exit: 982exit:
@@ -1331,12 +1173,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1331 smpboot_setup_io_apic(); 1173 smpboot_setup_io_apic();
1332 1174
1333 setup_boot_clock(); 1175 setup_boot_clock();
1334
1335 /*
1336 * Synchronize the TSC with the AP
1337 */
1338 if (cpu_has_tsc && cpucount && cpu_khz)
1339 synchronize_tsc_bp();
1340} 1176}
1341 1177
1342/* These are wrappers to interface to the new boot process. Someone 1178/* These are wrappers to interface to the new boot process. Someone
@@ -1471,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu)
1471 } 1307 }
1472 1308
1473 local_irq_enable(); 1309 local_irq_enable();
1310
1474 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1475 /* Unleash the CPU! */ 1312 /* Unleash the CPU! */
1476 cpu_set(cpu, smp_commenced_mask); 1313 cpu_set(cpu, smp_commenced_mask);
1314
1315 /*
1316 * Check TSC synchronization with the AP:
1317 */
1318 check_tsc_sync_source(cpu);
1319
1477 while (!cpu_isset(cpu, cpu_online_map)) 1320 while (!cpu_isset(cpu, cpu_online_map))
1478 cpu_relax(); 1321 cpu_relax();
1479 1322
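The boot-time TSC handshake removed above (synchronize_tsc_bp/_ap) averaged the per-CPU TSC samples and warned whenever a CPU sat more than two microseconds' worth of cycles from that average, using one_usec = cpu_khz / 1000; its replacement is the shared check_tsc_sync_source()/_target() code added as tsc_sync.c elsewhere in this patch. Purely to show what the old check measured, here is its arithmetic rerun on invented sample values:

    /* Re-creates the removed skew check on invented sample values. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const unsigned int cpu_khz = 2400000;           /* assumed 2.4 GHz box */
            const unsigned int one_usec = cpu_khz / 1000;   /* cycles per microsecond */
            uint64_t tsc[2] = { 1000000, 1012000 };         /* invented per-CPU samples */
            uint64_t avg = (tsc[0] + tsc[1]) / 2;

            for (int i = 0; i < 2; i++) {
                    int64_t delta = (int64_t)(tsc[i] - avg);
                    if (delta < 0)
                            delta = -delta;
                    if (delta > 2 * one_usec)
                            printf("CPU#%d had %lld usecs TSC skew\n",
                                   i, (long long)(delta / one_usec));
                    else
                            printf("CPU#%d within tolerance\n", i);
            }
            return 0;
    }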
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index a4f67a6e6821..a5350059557a 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -159,15 +159,6 @@ EXPORT_SYMBOL(profile_pc);
159 */ 159 */
160irqreturn_t timer_interrupt(int irq, void *dev_id) 160irqreturn_t timer_interrupt(int irq, void *dev_id)
161{ 161{
162 /*
163 * Here we are in the timer irq handler. We just have irqs locally
164 * disabled but we don't know if the timer_bh is running on the other
165 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
166 * the irq version of write_lock because as just said we have irq
167 * locally disabled. -arca
168 */
169 write_seqlock(&xtime_lock);
170
171#ifdef CONFIG_X86_IO_APIC 162#ifdef CONFIG_X86_IO_APIC
172 if (timer_ack) { 163 if (timer_ack) {
173 /* 164 /*
@@ -186,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
186 177
187 do_timer_interrupt_hook(); 178 do_timer_interrupt_hook();
188 179
189
190 if (MCA_bus) { 180 if (MCA_bus) {
191 /* The PS/2 uses level-triggered interrupts. You can't 181 /* The PS/2 uses level-triggered interrupts. You can't
192 turn them off, nor would you want to (any attempt to 182 turn them off, nor would you want to (any attempt to
@@ -201,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
201 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 191 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
202 } 192 }
203 193
204 write_sequnlock(&xtime_lock);
205
206#ifdef CONFIG_X86_LOCAL_APIC
207 if (using_apic_timer)
208 smp_send_timer_broadcast_ipi();
209#endif
210
211 return IRQ_HANDLED; 194 return IRQ_HANDLED;
212} 195}
213 196
214/* not static: needed by APM */ 197/* not static: needed by APM */
215unsigned long get_cmos_time(void) 198unsigned long read_persistent_clock(void)
216{ 199{
217 unsigned long retval; 200 unsigned long retval;
218 unsigned long flags; 201 unsigned long flags;
@@ -225,7 +208,6 @@ unsigned long get_cmos_time(void)
225 208
226 return retval; 209 return retval;
227} 210}
228EXPORT_SYMBOL(get_cmos_time);
229 211
230static void sync_cmos_clock(unsigned long dummy); 212static void sync_cmos_clock(unsigned long dummy);
231 213
@@ -278,114 +260,16 @@ void notify_arch_cmos_timer(void)
278 mod_timer(&sync_cmos_timer, jiffies + 1); 260 mod_timer(&sync_cmos_timer, jiffies + 1);
279} 261}
280 262
281static long clock_cmos_diff;
282static unsigned long sleep_start;
283
284static int timer_suspend(struct sys_device *dev, pm_message_t state)
285{
286 /*
287 * Estimate time zone so that set_time can update the clock
288 */
289 unsigned long ctime = get_cmos_time();
290
291 clock_cmos_diff = -ctime;
292 clock_cmos_diff += get_seconds();
293 sleep_start = ctime;
294 return 0;
295}
296
297static int timer_resume(struct sys_device *dev)
298{
299 unsigned long flags;
300 unsigned long sec;
301 unsigned long ctime = get_cmos_time();
302 long sleep_length = (ctime - sleep_start) * HZ;
303 struct timespec ts;
304
305 if (sleep_length < 0) {
306 printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
307 /* The time after the resume must not be earlier than the time
308 * before the suspend or some nasty things will happen
309 */
310 sleep_length = 0;
311 ctime = sleep_start;
312 }
313#ifdef CONFIG_HPET_TIMER
314 if (is_hpet_enabled())
315 hpet_reenable();
316#endif
317 setup_pit_timer();
318
319 sec = ctime + clock_cmos_diff;
320 ts.tv_sec = sec;
321 ts.tv_nsec = 0;
322 do_settimeofday(&ts);
323 write_seqlock_irqsave(&xtime_lock, flags);
324 jiffies_64 += sleep_length;
325 write_sequnlock_irqrestore(&xtime_lock, flags);
326 touch_softlockup_watchdog();
327 return 0;
328}
329
330static struct sysdev_class timer_sysclass = {
331 .resume = timer_resume,
332 .suspend = timer_suspend,
333 set_kset_name("timer"),
334};
335
336
337/* XXX this driverfs stuff should probably go elsewhere later -john */
338static struct sys_device device_timer = {
339 .id = 0,
340 .cls = &timer_sysclass,
341};
342
343static int time_init_device(void)
344{
345 int error = sysdev_class_register(&timer_sysclass);
346 if (!error)
347 error = sysdev_register(&device_timer);
348 return error;
349}
350
351device_initcall(time_init_device);
352
353#ifdef CONFIG_HPET_TIMER
354extern void (*late_time_init)(void); 263extern void (*late_time_init)(void);
355/* Duplicate of time_init() below, with hpet_enable part added */ 264/* Duplicate of time_init() below, with hpet_enable part added */
356static void __init hpet_time_init(void) 265static void __init hpet_time_init(void)
357{ 266{
358 struct timespec ts; 267 if (!hpet_enable())
359 ts.tv_sec = get_cmos_time(); 268 setup_pit_timer();
360 ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
361
362 do_settimeofday(&ts);
363
364 if ((hpet_enable() >= 0) && hpet_use_timer) {
365 printk("Using HPET for base-timer\n");
366 }
367
368 do_time_init(); 269 do_time_init();
369} 270}
370#endif
371 271
372void __init time_init(void) 272void __init time_init(void)
373{ 273{
374 struct timespec ts; 274 late_time_init = hpet_time_init;
375#ifdef CONFIG_HPET_TIMER
376 if (is_hpet_capable()) {
377 /*
378 * HPET initialization needs to do memory-mapped io. So, let
379 * us do a late initialization after mem_init().
380 */
381 late_time_init = hpet_time_init;
382 return;
383 }
384#endif
385 ts.tv_sec = get_cmos_time();
386 ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
387
388 do_settimeofday(&ts);
389
390 do_time_init();
391} 275}
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 46f752a8bbf3..3082a418635c 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -60,12 +60,6 @@ static inline int check_tsc_unstable(void)
60 return tsc_unstable; 60 return tsc_unstable;
61} 61}
62 62
63void mark_tsc_unstable(void)
64{
65 tsc_unstable = 1;
66}
67EXPORT_SYMBOL_GPL(mark_tsc_unstable);
68
69/* Accellerators for sched_clock() 63/* Accellerators for sched_clock()
70 * convert from cycles(64bits) => nanoseconds (64bits) 64 * convert from cycles(64bits) => nanoseconds (64bits)
71 * basic equation: 65 * basic equation:
@@ -222,34 +216,6 @@ out_no_tsc:
222 216
223#ifdef CONFIG_CPU_FREQ 217#ifdef CONFIG_CPU_FREQ
224 218
225static unsigned int cpufreq_delayed_issched = 0;
226static unsigned int cpufreq_init = 0;
227static struct work_struct cpufreq_delayed_get_work;
228
229static void handle_cpufreq_delayed_get(struct work_struct *work)
230{
231 unsigned int cpu;
232
233 for_each_online_cpu(cpu)
234 cpufreq_get(cpu);
235
236 cpufreq_delayed_issched = 0;
237}
238
239/*
240 * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
241 * to verify the CPU frequency the timing core thinks the CPU is running
242 * at is still correct.
243 */
244static inline void cpufreq_delayed_get(void)
245{
246 if (cpufreq_init && !cpufreq_delayed_issched) {
247 cpufreq_delayed_issched = 1;
248 printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
249 schedule_work(&cpufreq_delayed_get_work);
250 }
251}
252
253/* 219/*
254 * if the CPU frequency is scaled, TSC-based delays will need a different 220 * if the CPU frequency is scaled, TSC-based delays will need a different
255 * loops_per_jiffy value to function properly. 221 * loops_per_jiffy value to function properly.
@@ -313,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
313 279
314static int __init cpufreq_tsc(void) 280static int __init cpufreq_tsc(void)
315{ 281{
316 int ret; 282 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
317 283 CPUFREQ_TRANSITION_NOTIFIER);
318 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
319 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
320 CPUFREQ_TRANSITION_NOTIFIER);
321 if (!ret)
322 cpufreq_init = 1;
323
324 return ret;
325} 284}
326
327core_initcall(cpufreq_tsc); 285core_initcall(cpufreq_tsc);
328 286
329#endif 287#endif
@@ -331,7 +289,6 @@ core_initcall(cpufreq_tsc);
331/* clock source code */ 289/* clock source code */
332 290
333static unsigned long current_tsc_khz = 0; 291static unsigned long current_tsc_khz = 0;
334static int tsc_update_callback(void);
335 292
336static cycle_t read_tsc(void) 293static cycle_t read_tsc(void)
337{ 294{
@@ -349,37 +306,28 @@ static struct clocksource clocksource_tsc = {
349 .mask = CLOCKSOURCE_MASK(64), 306 .mask = CLOCKSOURCE_MASK(64),
350 .mult = 0, /* to be set */ 307 .mult = 0, /* to be set */
351 .shift = 22, 308 .shift = 22,
352 .update_callback = tsc_update_callback, 309 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
353 .is_continuous = 1, 310 CLOCK_SOURCE_MUST_VERIFY,
354}; 311};
355 312
356static int tsc_update_callback(void) 313void mark_tsc_unstable(void)
357{ 314{
358 int change = 0; 315 if (!tsc_unstable) {
359 316 tsc_unstable = 1;
360 /* check to see if we should switch to the safe clocksource: */ 317 /* Can be called before registration */
361 if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { 318 if (clocksource_tsc.mult)
362 clocksource_tsc.rating = 0; 319 clocksource_change_rating(&clocksource_tsc, 0);
363 clocksource_reselect(); 320 else
364 change = 1; 321 clocksource_tsc.rating = 0;
365 }
366
367 /* only update if tsc_khz has changed: */
368 if (current_tsc_khz != tsc_khz) {
369 current_tsc_khz = tsc_khz;
370 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
371 clocksource_tsc.shift);
372 change = 1;
373 } 322 }
374
375 return change;
376} 323}
324EXPORT_SYMBOL_GPL(mark_tsc_unstable);
377 325
378static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) 326static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
379{ 327{
380 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", 328 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
381 d->ident); 329 d->ident);
382 mark_tsc_unstable(); 330 tsc_unstable = 1;
383 return 0; 331 return 0;
384} 332}
385 333
@@ -396,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
396 {} 344 {}
397}; 345};
398 346
399#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
400static struct timer_list verify_tsc_freq_timer;
401
402/* XXX - Probably should add locking */
403static void verify_tsc_freq(unsigned long unused)
404{
405 static u64 last_tsc;
406 static unsigned long last_jiffies;
407
408 u64 now_tsc, interval_tsc;
409 unsigned long now_jiffies, interval_jiffies;
410
411
412 if (check_tsc_unstable())
413 return;
414
415 rdtscll(now_tsc);
416 now_jiffies = jiffies;
417
418 if (!last_jiffies) {
419 goto out;
420 }
421
422 interval_jiffies = now_jiffies - last_jiffies;
423 interval_tsc = now_tsc - last_tsc;
424 interval_tsc *= HZ;
425 do_div(interval_tsc, cpu_khz*1000);
426
427 if (interval_tsc < (interval_jiffies * 3 / 4)) {
428 printk("TSC appears to be running slowly. "
429 "Marking it as unstable\n");
430 mark_tsc_unstable();
431 return;
432 }
433
434out:
435 last_tsc = now_tsc;
436 last_jiffies = now_jiffies;
437 /* set us up to go off on the next interval: */
438 mod_timer(&verify_tsc_freq_timer,
439 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
440}
441
442/* 347/*
443 * Make an educated guess if the TSC is trustworthy and synchronized 348 * Make an educated guess if the TSC is trustworthy and synchronized
444 * over all CPUs. 349 * over all CPUs.
445 */ 350 */
446static __init int unsynchronized_tsc(void) 351__cpuinit int unsynchronized_tsc(void)
447{ 352{
353 if (!cpu_has_tsc || tsc_unstable)
354 return 1;
448 /* 355 /*
449 * Intel systems are normally all synchronized. 356 * Intel systems are normally all synchronized.
450 * Exceptions must mark TSC as unstable: 357 * Exceptions must mark TSC as unstable:
451 */ 358 */
452 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) 359 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
453 return 0; 360 /* assume multi socket systems are not synchronized: */
361 if (num_possible_cpus() > 1)
362 tsc_unstable = 1;
363 }
364 return tsc_unstable;
365}
366
367/*
368 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
369 */
370#ifdef CONFIG_MGEODE_LX
371/* RTSC counts during suspend */
372#define RTSC_SUSP 0x100
373
374static void __init check_geode_tsc_reliable(void)
375{
376 unsigned long val;
454 377
455 /* assume multi socket systems are not synchronized: */ 378 rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
456 return num_possible_cpus() > 1; 379 if ((val & RTSC_SUSP))
380 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
457} 381}
382#else
383static inline void check_geode_tsc_reliable(void) { }
384#endif
458 385
459static int __init init_tsc_clocksource(void) 386static int __init init_tsc_clocksource(void)
460{ 387{
@@ -463,20 +390,16 @@ static int __init init_tsc_clocksource(void)
463 /* check blacklist */ 390 /* check blacklist */
464 dmi_check_system(bad_tsc_dmi_table); 391 dmi_check_system(bad_tsc_dmi_table);
465 392
466 if (unsynchronized_tsc()) /* mark unstable if unsynced */ 393 unsynchronized_tsc();
467 mark_tsc_unstable(); 394 check_geode_tsc_reliable();
468 current_tsc_khz = tsc_khz; 395 current_tsc_khz = tsc_khz;
469 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, 396 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
470 clocksource_tsc.shift); 397 clocksource_tsc.shift);
471 /* lower the rating if we already know its unstable: */ 398 /* lower the rating if we already know its unstable: */
472 if (check_tsc_unstable()) 399 if (check_tsc_unstable()) {
473 clocksource_tsc.rating = 0; 400 clocksource_tsc.rating = 0;
474 401 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
475 init_timer(&verify_tsc_freq_timer); 402 }
476 verify_tsc_freq_timer.function = verify_tsc_freq;
477 verify_tsc_freq_timer.expires =
478 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
479 add_timer(&verify_tsc_freq_timer);
480 403
481 return clocksource_register(&clocksource_tsc); 404 return clocksource_register(&clocksource_tsc);
482 } 405 }
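The reworked tsc.c still derives clocksource_tsc.mult from clocksource_khz2mult(tsc_khz, 22), so nanoseconds are recovered as cycles * mult >> shift. The program below works that scaling through for an assumed 2 GHz TSC; it uses the simple (10^6 << shift) / khz approximation and ignores the rounding details of the real helper:

    /* Worked example of the cycles -> nanoseconds mult/shift scaling. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const uint32_t shift = 22;             /* clocksource_tsc.shift in the patch */
            const uint64_t tsc_khz = 2000000;      /* assumed 2 GHz TSC */

            /* one cycle of a khz-rated clock lasts 10^6/khz ns,
             * so mult ~= (10^6 << shift) / khz */
            uint32_t mult = (uint32_t)(((uint64_t)1000000 << shift) / tsc_khz);

            uint64_t cycles = 4000000;             /* 4e6 cycles at 2 GHz = 2 ms */
            uint64_t ns = (cycles * mult) >> shift;

            printf("mult = %u, %llu cycles -> %llu ns\n",
                   mult, (unsigned long long)cycles, (unsigned long long)ns);
            return 0;
    }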
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
index 2e2d8dbcbd68..76d2adcae5a3 100644
--- a/arch/i386/kernel/vmitime.c
+++ b/arch/i386/kernel/vmitime.c
@@ -115,7 +115,7 @@ static struct clocksource clocksource_vmi = {
115 .mask = CLOCKSOURCE_MASK(64), 115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */ 116 .mult = 0, /* to be set */
117 .shift = 22, 117 .shift = 22,
118 .is_continuous = 1, 118 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
119}; 119};
120 120
121 121
diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c
index cc2f519b2f7f..c78816210706 100644
--- a/arch/i386/mach-default/setup.c
+++ b/arch/i386/mach-default/setup.c
@@ -79,7 +79,12 @@ void __init trap_init_hook(void)
79{ 79{
80} 80}
81 81
82static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; 82static struct irqaction irq0 = {
83 .handler = timer_interrupt,
84 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
85 .mask = CPU_MASK_NONE,
86 .name = "timer"
87};
83 88
84/** 89/**
85 * time_init_hook - do any specific initialisations for the system timer. 90 * time_init_hook - do any specific initialisations for the system timer.
@@ -90,6 +95,7 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE,
90 **/ 95 **/
91void __init time_init_hook(void) 96void __init time_init_hook(void)
92{ 97{
98 irq0.mask = cpumask_of_cpu(0);
93 setup_irq(0, &irq0); 99 setup_irq(0, &irq0);
94} 100}
95 101
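The irq0 definition switches from a positional initializer to C99 designated initializers, adds IRQF_NOBALANCING, and time_init_hook() now pins the timer interrupt to CPU 0. The toy structure below (names made up for illustration) shows why the designated form is preferred: only the relevant fields are named, and the initializer survives fields being added or reordered.

    /* Designated vs. positional initializers on a made-up structure. */
    #include <stdio.h>

    struct toy_irqaction {
            void (*handler)(void);
            unsigned long flags;
            const char *name;
    };

    static void timer_handler(void) { }

    /* Positional: every field must appear, in declaration order. */
    static struct toy_irqaction pos = { timer_handler, 0x20, "timer" };

    /* Designated: only the fields that matter, order-independent. */
    static struct toy_irqaction des = {
            .handler = timer_handler,
            .flags   = 0x20,
            .name    = "timer",
    };

    int main(void)
    {
            printf("%s / %s, flags %#lx\n", pos.name, des.name, des.flags);
            return 0;
    }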
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 545fcbc8cea2..e5e56bd498db 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -307,7 +307,7 @@ static unsigned int __init calibrate_hpt(void)
307struct clocksource clocksource_mips = { 307struct clocksource clocksource_mips = {
308 .name = "MIPS", 308 .name = "MIPS",
309 .mask = 0xffffffff, 309 .mask = 0xffffffff,
310 .is_continuous = 1, 310 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
311}; 311};
312 312
313static void __init init_mips_clocksource(void) 313static void __init init_mips_clocksource(void)
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 39db12890214..5e5c0e4add91 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -305,8 +305,6 @@ static int pmac_pic_host_map(struct irq_host *h, unsigned int virq,
305 level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f))); 305 level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f)));
306 if (level) 306 if (level)
307 desc->status |= IRQ_LEVEL; 307 desc->status |= IRQ_LEVEL;
308 else
309 desc->status |= IRQ_DELAYED_DISABLE;
310 set_irq_chip_and_handler(virq, &pmac_pic, level ? 308 set_irq_chip_and_handler(virq, &pmac_pic, level ?
311 handle_level_irq : handle_edge_irq); 309 handle_level_irq : handle_edge_irq);
312 return 0; 310 return 0;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 3b91f27ab202..ee9fd7b85928 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -312,7 +312,7 @@ static struct clocksource clocksource_tod = {
312 .mask = -1ULL, 312 .mask = -1ULL,
313 .mult = 1000, 313 .mult = 1000,
314 .shift = 12, 314 .shift = 12,
315 .is_continuous = 1, 315 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
316}; 316};
317 317
318 318
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 925a65240cfe..b2e1fd8e3571 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -97,20 +97,22 @@ static int write_sigio_thread(void *unused)
97 97
98static int need_poll(struct pollfds *polls, int n) 98static int need_poll(struct pollfds *polls, int n)
99{ 99{
100 if(n <= polls->size){ 100 struct pollfd *new;
101 polls->used = n; 101
102 if(n <= polls->size)
102 return 0; 103 return 0;
103 } 104
104 kfree(polls->poll); 105 new = um_kmalloc_atomic(n * sizeof(struct pollfd));
105 polls->poll = um_kmalloc_atomic(n * sizeof(struct pollfd)); 106 if(new == NULL){
106 if(polls->poll == NULL){
107 printk("need_poll : failed to allocate new pollfds\n"); 107 printk("need_poll : failed to allocate new pollfds\n");
108 polls->size = 0;
109 polls->used = 0;
110 return -ENOMEM; 108 return -ENOMEM;
111 } 109 }
110
111 memcpy(new, polls->poll, polls->used * sizeof(struct pollfd));
112 kfree(polls->poll);
113
114 polls->poll = new;
112 polls->size = n; 115 polls->size = n;
113 polls->used = n;
114 return 0; 116 return 0;
115} 117}
116 118
@@ -171,15 +173,15 @@ int add_sigio_fd(int fd)
171 goto out; 173 goto out;
172 } 174 }
173 175
174 n = current_poll.used + 1; 176 n = current_poll.used;
175 err = need_poll(&next_poll, n); 177 err = need_poll(&next_poll, n + 1);
176 if(err) 178 if(err)
177 goto out; 179 goto out;
178 180
179 for(i = 0; i < current_poll.used; i++) 181 memcpy(next_poll.poll, current_poll.poll,
180 next_poll.poll[i] = current_poll.poll[i]; 182 current_poll.used * sizeof(struct pollfd));
181 183 next_poll.poll[n] = *p;
182 next_poll.poll[n - 1] = *p; 184 next_poll.used = n + 1;
183 update_thread(); 185 update_thread();
184 out: 186 out:
185 sigio_unlock(); 187 sigio_unlock();
@@ -214,6 +216,7 @@ int ignore_sigio_fd(int fd)
214 if(p->fd != fd) 216 if(p->fd != fd)
215 next_poll.poll[n++] = *p; 217 next_poll.poll[n++] = *p;
216 } 218 }
219 next_poll.used = current_poll.used - 1;
217 220
218 update_thread(); 221 update_thread();
219 out: 222 out:
@@ -331,10 +334,9 @@ void maybe_sigio_broken(int fd, int read)
331 334
332 sigio_lock(); 335 sigio_lock();
333 err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); 336 err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1);
334 if(err){ 337 if(err)
335 printk("maybe_sigio_broken - failed to add pollfd\n");
336 goto out; 338 goto out;
337 } 339
338 all_sigio_fds.poll[all_sigio_fds.used++] = 340 all_sigio_fds.poll[all_sigio_fds.used++] =
339 ((struct pollfd) { .fd = fd, 341 ((struct pollfd) { .fd = fd,
340 .events = read ? POLLIN : POLLOUT, 342 .events = read ? POLLIN : POLLOUT,
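need_poll() now allocates the larger array first, copies the live entries, and only then frees the old buffer, so a failed allocation leaves the previous pollfd set intact. A generic userspace rendition of that allocate-copy-swap growth pattern, using plain malloc/memcpy in place of um_kmalloc_atomic(), might look like this:

    /* Grow an array without losing the old contents on allocation failure. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct buf {
            int *data;
            size_t size;   /* allocated elements */
            size_t used;   /* live elements */
    };

    static int grow(struct buf *b, size_t n)
    {
            int *new;

            if (n <= b->size)
                    return 0;                  /* already big enough */

            new = malloc(n * sizeof(*new));
            if (new == NULL)
                    return -1;                 /* old data is still intact */

            memcpy(new, b->data, b->used * sizeof(*new));
            free(b->data);
            b->data = new;
            b->size = n;
            return 0;
    }

    int main(void)
    {
            struct buf b = { .data = malloc(2 * sizeof(int)), .size = 2, .used = 2 };

            if (b.data == NULL)
                    return 1;
            b.data[0] = 10;
            b.data[1] = 20;
            if (grow(&b, 8) == 0)
                    printf("grown to %zu, first element still %d\n", b.size, b.data[0]);
            free(b.data);
            return 0;
    }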
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 7982cbc3bc94..56eb14c98475 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -24,6 +24,14 @@ config X86
24 bool 24 bool
25 default y 25 default y
26 26
27config GENERIC_TIME
28 bool
29 default y
30
31config GENERIC_TIME_VSYSCALL
32 bool
33 default y
34
27config ZONE_DMA32 35config ZONE_DMA32
28 bool 36 bool
29 default y 37 default y
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index ae399458024b..bb47e86f3d02 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \ 9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ 10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
11 pci-dma.o pci-nommu.o alternative.o 11 pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o 14obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/
19obj-$(CONFIG_X86_MSR) += msr.o 19obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o 20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o 21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o 22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o
23obj-y += apic.o nmi.o 23obj-y += apic.o nmi.o
24obj-y += io_apic.o mpparse.o \ 24obj-y += io_apic.o mpparse.o \
25 genapic.o genapic_cluster.o genapic_flat.o 25 genapic.o genapic_cluster.o genapic_flat.o
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 124b2d27b4ac..723417d924c0 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -37,6 +37,7 @@
37#include <asm/idle.h> 37#include <asm/idle.h>
38#include <asm/proto.h> 38#include <asm/proto.h>
39#include <asm/timex.h> 39#include <asm/timex.h>
40#include <asm/hpet.h>
40#include <asm/apic.h> 41#include <asm/apic.h>
41 42
42int apic_mapped; 43int apic_mapped;
@@ -763,7 +764,7 @@ static void setup_APIC_timer(unsigned int clocks)
763 local_irq_save(flags); 764 local_irq_save(flags);
764 765
765 /* wait for irq slice */ 766 /* wait for irq slice */
766 if (vxtime.hpet_address && hpet_use_timer) { 767 if (hpet_address && hpet_use_timer) {
767 int trigger = hpet_readl(HPET_T0_CMP); 768 int trigger = hpet_readl(HPET_T0_CMP);
768 while (hpet_readl(HPET_COUNTER) >= trigger) 769 while (hpet_readl(HPET_COUNTER) >= trigger)
769 /* do nothing */ ; 770 /* do nothing */ ;
@@ -785,7 +786,7 @@ static void setup_APIC_timer(unsigned int clocks)
785 /* Turn off PIT interrupt if we use APIC timer as main timer. 786 /* Turn off PIT interrupt if we use APIC timer as main timer.
786 Only works with the PM timer right now 787 Only works with the PM timer right now
787 TBD fix it for HPET too. */ 788 TBD fix it for HPET too. */
788 if (vxtime.mode == VXTIME_PMTMR && 789 if ((pmtmr_ioport != 0) &&
789 smp_processor_id() == boot_cpu_id && 790 smp_processor_id() == boot_cpu_id &&
790 apic_runs_main_timer == 1 && 791 apic_runs_main_timer == 1 &&
791 !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { 792 !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
diff --git a/arch/i386/kernel/time_hpet.c b/arch/x86_64/kernel/hpet.c
index 1e4702dfcd01..65a0edd71a17 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/x86_64/kernel/hpet.c
@@ -1,224 +1,138 @@
1/*
2 * linux/arch/i386/kernel/time_hpet.c
3 * This code largely copied from arch/x86_64/kernel/time.c
4 * See that file for credits.
5 *
6 * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support
7 */
8
9#include <linux/errno.h>
10#include <linux/kernel.h> 1#include <linux/kernel.h>
11#include <linux/param.h> 2#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/init.h> 3#include <linux/init.h>
14#include <linux/smp.h> 4#include <linux/mc146818rtc.h>
5#include <linux/time.h>
6#include <linux/clocksource.h>
7#include <linux/ioport.h>
8#include <linux/acpi.h>
9#include <linux/hpet.h>
10#include <asm/pgtable.h>
11#include <asm/vsyscall.h>
12#include <asm/timex.h>
13#include <asm/hpet.h>
15 14
16#include <asm/timer.h> 15int nohpet __initdata;
17#include <asm/fixmap.h>
18#include <asm/apic.h>
19 16
20#include <linux/timex.h> 17unsigned long hpet_address;
18unsigned long hpet_period; /* fsecs / HPET clock */
19unsigned long hpet_tick; /* HPET clocks / interrupt */
21 20
22#include <asm/hpet.h> 21int hpet_use_timer; /* Use counter of hpet for time keeping,
23#include <linux/hpet.h> 22 * otherwise PIT
23 */
24 24
25static unsigned long hpet_period; /* fsecs / HPET clock */ 25#ifdef CONFIG_HPET
26unsigned long hpet_tick; /* hpet clks count per tick */ 26static __init int late_hpet_init(void)
27unsigned long hpet_address; /* hpet memory map physical address */ 27{
28int hpet_use_timer; 28 struct hpet_data hd;
29 unsigned int ntimer;
29 30
30static int use_hpet; /* can be used for runtime check of hpet */ 31 if (!hpet_address)
31static int boot_hpet_disable; /* boottime override for HPET timer */ 32 return 0;
32static void __iomem * hpet_virt_address; /* hpet kernel virtual address */
33 33
34#define FSEC_TO_USEC (1000000000UL) 34 memset(&hd, 0, sizeof(hd));
35 35
36int hpet_readl(unsigned long a) 36 ntimer = hpet_readl(HPET_ID);
37{ 37 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
38 return readl(hpet_virt_address + a); 38 ntimer++;
39}
40 39
41static void hpet_writel(unsigned long d, unsigned long a) 40 /*
42{ 41 * Register with driver.
43 writel(d, hpet_virt_address + a); 42 * Timer0 and Timer1 is used by platform.
44} 43 */
44 hd.hd_phys_address = hpet_address;
45 hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
46 hd.hd_nirqs = ntimer;
47 hd.hd_flags = HPET_DATA_PLATFORM;
48 hpet_reserve_timer(&hd, 0);
49#ifdef CONFIG_HPET_EMULATE_RTC
50 hpet_reserve_timer(&hd, 1);
51#endif
52 hd.hd_irq[0] = HPET_LEGACY_8254;
53 hd.hd_irq[1] = HPET_LEGACY_RTC;
54 if (ntimer > 2) {
55 struct hpet *hpet;
56 struct hpet_timer *timer;
57 int i;
58
59 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
60 timer = &hpet->hpet_timers[2];
61 for (i = 2; i < ntimer; timer++, i++)
62 hd.hd_irq[i] = (timer->hpet_config &
63 Tn_INT_ROUTE_CNF_MASK) >>
64 Tn_INT_ROUTE_CNF_SHIFT;
45 65
46#ifdef CONFIG_X86_LOCAL_APIC 66 }
47/*
48 * HPET counters dont wrap around on every tick. They just change the
49 * comparator value and continue. Next tick can be caught by checking
50 * for a change in the comparator value. Used in apic.c.
51 */
52static void __devinit wait_hpet_tick(void)
53{
54 unsigned int start_cmp_val, end_cmp_val;
55 67
56 start_cmp_val = hpet_readl(HPET_T0_CMP); 68 hpet_alloc(&hd);
57 do { 69 return 0;
58 end_cmp_val = hpet_readl(HPET_T0_CMP);
59 } while (start_cmp_val == end_cmp_val);
60} 70}
71fs_initcall(late_hpet_init);
61#endif 72#endif
62 73
63static int hpet_timer_stop_set_go(unsigned long tick) 74int hpet_timer_stop_set_go(unsigned long tick)
64{ 75{
65 unsigned int cfg; 76 unsigned int cfg;
66 77
67 /* 78/*
68 * Stop the timers and reset the main counter. 79 * Stop the timers and reset the main counter.
69 */ 80 */
81
70 cfg = hpet_readl(HPET_CFG); 82 cfg = hpet_readl(HPET_CFG);
71 cfg &= ~HPET_CFG_ENABLE; 83 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
72 hpet_writel(cfg, HPET_CFG); 84 hpet_writel(cfg, HPET_CFG);
73 hpet_writel(0, HPET_COUNTER); 85 hpet_writel(0, HPET_COUNTER);
74 hpet_writel(0, HPET_COUNTER + 4); 86 hpet_writel(0, HPET_COUNTER + 4);
75 87
88/*
89 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
90 * and period also hpet_tick.
91 */
76 if (hpet_use_timer) { 92 if (hpet_use_timer) {
77 /* 93 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
78 * Set up timer 0, as periodic with first interrupt to happen at 94 HPET_TN_32BIT, HPET_T0_CFG);
79 * hpet_tick, and period also hpet_tick. 95 hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
80 */ 96 hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
81 cfg = hpet_readl(HPET_T0_CFG);
82 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
83 HPET_TN_SETVAL | HPET_TN_32BIT;
84 hpet_writel(cfg, HPET_T0_CFG);
85
86 /*
87 * The first write after writing TN_SETVAL to the config register sets
88 * the counter value, the second write sets the threshold.
89 */
90 hpet_writel(tick, HPET_T0_CMP);
91 hpet_writel(tick, HPET_T0_CMP);
92 }
93 /*
94 * Go!
95 */
96 cfg = hpet_readl(HPET_CFG);
97 if (hpet_use_timer)
98 cfg |= HPET_CFG_LEGACY; 97 cfg |= HPET_CFG_LEGACY;
98 }
99/*
100 * Go!
101 */
102
99 cfg |= HPET_CFG_ENABLE; 103 cfg |= HPET_CFG_ENABLE;
100 hpet_writel(cfg, HPET_CFG); 104 hpet_writel(cfg, HPET_CFG);
101 105
102 return 0; 106 return 0;
103} 107}
104 108
105/* 109int hpet_arch_init(void)
106 * Check whether HPET was found by ACPI boot parse. If yes setup HPET
107 * counter 0 for kernel base timer.
108 */
109int __init hpet_enable(void)
110{ 110{
111 unsigned int id; 111 unsigned int id;
112 unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */
113 unsigned long hpet_tick_rem;
114 112
115 if (boot_hpet_disable) 113 if (!hpet_address)
116 return -1; 114 return -1;
115 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
116 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
117
118/*
119 * Read the period, compute tick and quotient.
120 */
117 121
118 if (!hpet_address) {
119 return -1;
120 }
121 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
122 /*
123 * Read the period, compute tick and quotient.
124 */
125 id = hpet_readl(HPET_ID); 122 id = hpet_readl(HPET_ID);
126 123
127 /* 124 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
128 * We are checking for value '1' or more in number field if
129 * CONFIG_HPET_EMULATE_RTC is set because we will need an
130 * additional timer for RTC emulation.
131 * However, we can do with one timer otherwise using the
132 * the single HPET timer for system time.
133 */
134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
138 return -1; 125 return -1;
139 }
140#endif
141
142 126
143 hpet_period = hpet_readl(HPET_PERIOD); 127 hpet_period = hpet_readl(HPET_PERIOD);
144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { 128 if (hpet_period < 100000 || hpet_period > 100000000)
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
147 return -1; 129 return -1;
148 }
149 130
150 /* 131 hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
151 * 64 bit math
152 * First changing tick into fsec
153 * Then 64 bit div to find number of hpet clk per tick
154 */
155 ASM_MUL64_REG(tick_fsec_low, tick_fsec_high,
156 KERNEL_TICK_USEC, FSEC_TO_USEC);
157 ASM_DIV64_REG(hpet_tick, hpet_tick_rem,
158 hpet_period, tick_fsec_low, tick_fsec_high);
159
160 if (hpet_tick_rem > (hpet_period >> 1))
161 hpet_tick++; /* rounding the result */
162
163 hpet_use_timer = id & HPET_ID_LEGSUP;
164
165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
168 return -1;
169 }
170 132
171 use_hpet = 1; 133 hpet_use_timer = (id & HPET_ID_LEGSUP);
172 134
173#ifdef CONFIG_HPET 135 return hpet_timer_stop_set_go(hpet_tick);
174 {
175 struct hpet_data hd;
176 unsigned int ntimer;
177
178 memset(&hd, 0, sizeof (hd));
179
180 ntimer = hpet_readl(HPET_ID);
181 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
182 ntimer++;
183
184 /*
185 * Register with driver.
186 * Timer0 and Timer1 is used by platform.
187 */
188 hd.hd_phys_address = hpet_address;
189 hd.hd_address = hpet_virt_address;
190 hd.hd_nirqs = ntimer;
191 hd.hd_flags = HPET_DATA_PLATFORM;
192 hpet_reserve_timer(&hd, 0);
193#ifdef CONFIG_HPET_EMULATE_RTC
194 hpet_reserve_timer(&hd, 1);
195#endif
196 hd.hd_irq[0] = HPET_LEGACY_8254;
197 hd.hd_irq[1] = HPET_LEGACY_RTC;
198 if (ntimer > 2) {
199 struct hpet __iomem *hpet;
200 struct hpet_timer __iomem *timer;
201 int i;
202
203 hpet = hpet_virt_address;
204
205 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
206 timer++, i++)
207 hd.hd_irq[i] = (timer->hpet_config &
208 Tn_INT_ROUTE_CNF_MASK) >>
209 Tn_INT_ROUTE_CNF_SHIFT;
210
211 }
212
213 hpet_alloc(&hd);
214 }
215#endif
216
217#ifdef CONFIG_X86_LOCAL_APIC
218 if (hpet_use_timer)
219 wait_timer_tick = wait_hpet_tick;
220#endif
221 return 0;
222} 136}
223 137
224int hpet_reenable(void) 138int hpet_reenable(void)
@@ -226,28 +140,51 @@ int hpet_reenable(void)
226 return hpet_timer_stop_set_go(hpet_tick); 140 return hpet_timer_stop_set_go(hpet_tick);
227} 141}
228 142
229int is_hpet_enabled(void) 143/*
230{ 144 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
231 return use_hpet; 145 * it to the HPET timer of known frequency.
232} 146 */
233 147
234int is_hpet_capable(void) 148#define TICK_COUNT 100000000
149#define TICK_MIN 5000
150
151/*
152 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
153 * occurs between the reads of the hpet & TSC.
154 */
155static void __init read_hpet_tsc(int *hpet, int *tsc)
235{ 156{
236 if (!boot_hpet_disable && hpet_address) 157 int tsc1, tsc2, hpet1;
237 return 1; 158
238 return 0; 159 do {
160 tsc1 = get_cycles_sync();
161 hpet1 = hpet_readl(HPET_COUNTER);
162 tsc2 = get_cycles_sync();
163 } while (tsc2 - tsc1 > TICK_MIN);
164 *hpet = hpet1;
165 *tsc = tsc2;
239} 166}
240 167
241static int __init hpet_setup(char* str) 168unsigned int __init hpet_calibrate_tsc(void)
242{ 169{
243 if (str) { 170 int tsc_start, hpet_start;
244 if (!strncmp("disable", str, 7)) 171 int tsc_now, hpet_now;
245 boot_hpet_disable = 1; 172 unsigned long flags;
246 } 173
247 return 1; 174 local_irq_save(flags);
248} 175
176 read_hpet_tsc(&hpet_start, &tsc_start);
249 177
250__setup("hpet=", hpet_setup); 178 do {
179 local_irq_disable();
180 read_hpet_tsc(&hpet_now, &tsc_now);
181 local_irq_restore(flags);
182 } while ((tsc_now - tsc_start) < TICK_COUNT &&
183 (hpet_now - hpet_start) < TICK_COUNT);
184
185 return (tsc_now - tsc_start) * 1000000000L
186 / ((hpet_now - hpet_start) * hpet_period / 1000);
187}
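hpet_calibrate_tsc() above times TICK_COUNT-sized deltas of both counters; since hpet_period is in femtoseconds per HPET cycle, the expression tsc_delta * 10^9 / (hpet_delta * hpet_period / 1000) comes out in kHz. The snippet below simply evaluates that formula for assumed counter deltas on a 14.318 MHz HPET:

    /* Evaluate the hpet_calibrate_tsc() formula on assumed deltas. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const uint64_t hpet_period = 69841279;   /* fs per HPET cycle, ~14.318 MHz (assumed) */
            const uint64_t hpet_delta  = 1431818;    /* ~0.1 s worth of HPET cycles (assumed) */
            const uint64_t tsc_delta   = 200000000;  /* TSC cycles seen in the same window */

            uint64_t tsc_khz = tsc_delta * 1000000000ULL
                               / (hpet_delta * hpet_period / 1000);

            printf("TSC calibrated to ~%llu kHz\n", (unsigned long long)tsc_khz);
            return 0;
    }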
251 188
252#ifdef CONFIG_HPET_EMULATE_RTC 189#ifdef CONFIG_HPET_EMULATE_RTC
253/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET 190/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
@@ -264,7 +201,6 @@ __setup("hpet=", hpet_setup);
264 * For (3), we use interrupts at 64Hz or user specified periodic 201 * For (3), we use interrupts at 64Hz or user specified periodic
265 * frequency, whichever is higher. 202 * frequency, whichever is higher.
266 */ 203 */
267#include <linux/mc146818rtc.h>
268#include <linux/rtc.h> 204#include <linux/rtc.h>
269 205
270#define DEFAULT_RTC_INT_FREQ 64 206#define DEFAULT_RTC_INT_FREQ 64
@@ -283,6 +219,11 @@ static unsigned long PIE_count;
283static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ 219static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
284static unsigned int hpet_t1_cmp; /* cached comparator register */ 220static unsigned int hpet_t1_cmp; /* cached comparator register */
285 221
222int is_hpet_enabled(void)
223{
224 return hpet_address != 0;
225}
226
286/* 227/*
287 * Timer 1 for RTC, we do not use periodic interrupt feature, 228 * Timer 1 for RTC, we do not use periodic interrupt feature,
288 * even if HPET supports periodic interrupts on Timer 1. 229 * even if HPET supports periodic interrupts on Timer 1.
@@ -367,8 +308,9 @@ static void hpet_rtc_timer_reinit(void)
367 if (PIE_on) 308 if (PIE_on)
368 PIE_count += lost_ints; 309 PIE_count += lost_ints;
369 310
370 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", 311 if (printk_ratelimit())
371 hpet_rtc_int_freq); 312 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
313 hpet_rtc_int_freq);
372 } 314 }
373} 315}
374 316
@@ -450,7 +392,7 @@ int hpet_rtc_dropped_irq(void)
450 return 1; 392 return 1;
451} 393}
452 394
453irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) 395irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
454{ 396{
455 struct rtc_time curr_time; 397 struct rtc_time curr_time;
456 unsigned long rtc_int_flag = 0; 398 unsigned long rtc_int_flag = 0;
@@ -495,3 +437,75 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
495} 437}
496#endif 438#endif
497 439
440static int __init nohpet_setup(char *s)
441{
442 nohpet = 1;
443 return 1;
444}
445
446__setup("nohpet", nohpet_setup);
447
448#define HPET_MASK 0xFFFFFFFF
449#define HPET_SHIFT 22
450
451/* FSEC = 10^-15 NSEC = 10^-9 */
452#define FSEC_PER_NSEC 1000000
453
454static void *hpet_ptr;
455
456static cycle_t read_hpet(void)
457{
458 return (cycle_t)readl(hpet_ptr);
459}
460
461static cycle_t __vsyscall_fn vread_hpet(void)
462{
463 return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
464}
465
466struct clocksource clocksource_hpet = {
467 .name = "hpet",
468 .rating = 250,
469 .read = read_hpet,
470 .mask = (cycle_t)HPET_MASK,
471 .mult = 0, /* set below */
472 .shift = HPET_SHIFT,
473 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
474 .vread = vread_hpet,
475};
476
477static int __init init_hpet_clocksource(void)
478{
479 unsigned long hpet_period;
480 void __iomem *hpet_base;
481 u64 tmp;
482
483 if (!hpet_address)
484 return -ENODEV;
485
486 /* calculate the hpet address: */
487 hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
488 hpet_ptr = hpet_base + HPET_COUNTER;
489
490 /* calculate the frequency: */
491 hpet_period = readl(hpet_base + HPET_PERIOD);
492
493 /*
494 * hpet period is in femto seconds per cycle
495 * so we need to convert this to ns/cyc units
 496 * approximated by mult/2^shift
497 *
498 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
499 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
500 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
501 * (fsec/cyc << shift)/1000000 = mult
502 * (hpet_period << shift)/FSEC_PER_NSEC = mult
503 */
504 tmp = (u64)hpet_period << HPET_SHIFT;
505 do_div(tmp, FSEC_PER_NSEC);
506 clocksource_hpet.mult = (u32)tmp;
507
508 return clocksource_register(&clocksource_hpet);
509}
510
511module_init(init_hpet_clocksource);
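Two conversions drive the new hpet.c: hpet_tick rounds FSEC_PER_TICK (taken here to be 10^15/HZ femtoseconds per kernel tick, inferred from its use rather than quoted from a header) to the nearest whole number of HPET cycles, and clocksource_hpet.mult is (hpet_period << HPET_SHIFT) / FSEC_PER_NSEC so that counter_delta * mult >> shift yields nanoseconds. Both are reworked below for an assumed 14.318 MHz HPET and HZ=250:

    /* Work the hpet_tick and clocksource mult conversions through by hand. */
    #include <stdio.h>
    #include <stdint.h>

    #define HZ             250
    #define FSEC_PER_TICK  (1000000000000000ULL / HZ)   /* assumed: fs per kernel tick */
    #define FSEC_PER_NSEC  1000000ULL
    #define HPET_SHIFT     22

    int main(void)
    {
            uint64_t hpet_period = 69841279;   /* fs per HPET cycle (~14.318 MHz, assumed) */

            /* HPET cycles per tick, rounded to nearest */
            uint64_t hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;

            /* ns = cycles * mult >> shift, so mult = (fs/cycle << shift) / (fs/ns) */
            uint64_t mult = (hpet_period << HPET_SHIFT) / FSEC_PER_NSEC;

            uint64_t cycles = hpet_tick;       /* one tick's worth of counter movement */
            uint64_t ns = (cycles * mult) >> HPET_SHIFT;

            printf("hpet_tick = %llu cycles, mult = %llu, one tick ~= %llu ns\n",
                   (unsigned long long)hpet_tick, (unsigned long long)mult,
                   (unsigned long long)ns);
            return 0;
    }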
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index d73c79e821f1..01e2cf0bdeb1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -103,6 +103,7 @@ static void mask_and_ack_8259A(unsigned int);
103static struct irq_chip i8259A_chip = { 103static struct irq_chip i8259A_chip = {
104 .name = "XT-PIC", 104 .name = "XT-PIC",
105 .mask = disable_8259A_irq, 105 .mask = disable_8259A_irq,
106 .disable = disable_8259A_irq,
106 .unmask = enable_8259A_irq, 107 .unmask = enable_8259A_irq,
107 .mask_ack = mask_and_ack_8259A, 108 .mask_ack = mask_and_ack_8259A,
108}; 109};
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 566e64d966c4..950682f35766 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -810,11 +810,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
810 trigger == IOAPIC_LEVEL) 810 trigger == IOAPIC_LEVEL)
811 set_irq_chip_and_handler_name(irq, &ioapic_chip, 811 set_irq_chip_and_handler_name(irq, &ioapic_chip,
812 handle_fasteoi_irq, "fasteoi"); 812 handle_fasteoi_irq, "fasteoi");
813 else { 813 else
814 irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
815 set_irq_chip_and_handler_name(irq, &ioapic_chip, 814 set_irq_chip_and_handler_name(irq, &ioapic_chip,
816 handle_edge_irq, "edge"); 815 handle_edge_irq, "edge");
817 }
818} 816}
819static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) 817static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
820{ 818{
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
index 7554458dc9cb..ae8f91214f15 100644
--- a/arch/x86_64/kernel/pmtimer.c
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -24,15 +24,6 @@
24#include <asm/msr.h> 24#include <asm/msr.h>
25#include <asm/vsyscall.h> 25#include <asm/vsyscall.h>
26 26
27/* The I/O port the PMTMR resides at.
28 * The location is detected during setup_arch(),
29 * in arch/i386/kernel/acpi/boot.c */
30u32 pmtmr_ioport __read_mostly;
31
32/* value of the Power timer at last timer interrupt */
33static u32 offset_delay;
34static u32 last_pmtmr_tick;
35
36#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ 27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
37 28
38static inline u32 cyc2us(u32 cycles) 29static inline u32 cyc2us(u32 cycles)
@@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles)
48 return (cycles >> 10); 39 return (cycles >> 10);
49} 40}
50 41
51int pmtimer_mark_offset(void)
52{
53 static int first_run = 1;
54 unsigned long tsc;
55 u32 lost;
56
57 u32 tick = inl(pmtmr_ioport);
58 u32 delta;
59
60 delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
61
62 last_pmtmr_tick = tick;
63 monotonic_base += delta * NSEC_PER_USEC;
64
65 delta += offset_delay;
66
67 lost = delta / (USEC_PER_SEC / HZ);
68 offset_delay = delta % (USEC_PER_SEC / HZ);
69
70 rdtscll(tsc);
71 vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000;
72
73 /* don't calculate delay for first run,
74 or if we've got less then a tick */
75 if (first_run || (lost < 1)) {
76 first_run = 0;
77 offset_delay = 0;
78 }
79
80 return lost - 1;
81}
82
83static unsigned pmtimer_wait_tick(void) 42static unsigned pmtimer_wait_tick(void)
84{ 43{
85 u32 a, b; 44 u32 a, b;
@@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us)
101 } while (cyc2us(b - a) < us); 60 } while (cyc2us(b - a) < us);
102} 61}
103 62
104void pmtimer_resume(void)
105{
106 last_pmtmr_tick = inl(pmtmr_ioport);
107}
108
109unsigned int do_gettimeoffset_pm(void)
110{
111 u32 now, offset, delta = 0;
112
113 offset = last_pmtmr_tick;
114 now = inl(pmtmr_ioport);
115 delta = (now - offset) & ACPI_PM_MASK;
116
117 return offset_delay + cyc2us(delta);
118}
119
120
121static int __init nopmtimer_setup(char *s) 63static int __init nopmtimer_setup(char *s)
122{ 64{
123 pmtmr_ioport = 0; 65 pmtmr_ioport = 0;
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index daf19332f0dd..35443729aad8 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
148 print_cpu_info(c); 148 print_cpu_info(c);
149} 149}
150 150
151/*
152 * New Funky TSC sync algorithm borrowed from IA64.
153 * Main advantage is that it doesn't reset the TSCs fully and
154 * in general looks more robust and it works better than my earlier
155 * attempts. I believe it was written by David Mosberger. Some minor
156 * adjustments for x86-64 by me -AK
157 *
158 * Original comment reproduced below.
159 *
160 * Synchronize TSC of the current (slave) CPU with the TSC of the
161 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
162 * eliminate the possibility of unaccounted-for errors (such as
163 * getting a machine check in the middle of a calibration step). The
164 * basic idea is for the slave to ask the master what itc value it has
165 * and to read its own itc before and after the master responds. Each
166 * iteration gives us three timestamps:
167 *
168 * slave master
169 *
170 * t0 ---\
171 * ---\
172 * --->
173 * tm
174 * /---
175 * /---
176 * t1 <---
177 *
178 *
179 * The goal is to adjust the slave's TSC such that tm falls exactly
180 * half-way between t0 and t1. If we achieve this, the clocks are
181 * synchronized provided the interconnect between the slave and the
182 * master is symmetric. Even if the interconnect were asymmetric, we
183 * would still know that the synchronization error is smaller than the
184 * roundtrip latency (t0 - t1).
185 *
186 * When the interconnect is quiet and symmetric, this lets us
187 * synchronize the TSC to within one or two cycles. However, we can
188 * only *guarantee* that the synchronization is accurate to within a
189 * round-trip time, which is typically in the range of several hundred
190 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
191 * are usually almost perfectly synchronized, but we shouldn't assume
 192 * that the accuracy is much better than half a microsecond or so.
 193 *
 194 * [there are other errors like the latency of RDTSC and of the
 195 * WRMSR. These can also amount to hundreds of cycles. So it's
 196 * probably worse. It claims 153 cycles error on a dual Opteron,
 197 * but I suspect the numbers are actually somewhat worse -AK]
198 */
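
As a toy numeric illustration of the scheme described above (all timestamps invented): if the slave reads t0 = 1000 and t1 = 1600 around the master's reply tm = 1450, the midpoint is 1300, so the slave's clock appears 150 cycles behind the master, and the error of that estimate is bounded by the round trip t1 - t0 = 600 cycles. A standalone sketch of the same arithmetic, including the overflow-safe averaging that get_delta() below uses:

#include <stdio.h>

/* overflow-safe average of two unsigned timestamps, as in get_delta() */
static unsigned long midpoint(unsigned long t0, unsigned long t1)
{
	unsigned long mid = t0 / 2 + t1 / 2;

	if (t0 % 2 + t1 % 2 == 2)	/* both odd: put the lost halves back */
		++mid;
	return mid;
}

int main(void)
{
	unsigned long t0 = 1000, tm = 1450, t1 = 1600;	/* invented samples */
	long delta = (long)midpoint(t0, t1) - (long)tm;

	/* delta < 0: slave behind the master; |error| <= roundtrip t1 - t0 */
	printf("delta=%ld cycles, roundtrip bound=%lu cycles\n", delta, t1 - t0);
	return 0;
}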
199
200#define MASTER 0
201#define SLAVE (SMP_CACHE_BYTES/8)
202
 203/* Intentionally don't use cpu_relax() during TSC synchronization
204 because we don't want to go into funky power save modi or cause
205 hypervisors to schedule us away. Going to sleep would likely affect
206 latency and low latency is the primary objective here. -AK */
207#define no_cpu_relax() barrier()
208
209static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
210static volatile __cpuinitdata unsigned long go[SLAVE + 1];
211static int notscsync __cpuinitdata;
212
213#undef DEBUG_TSC_SYNC
214
215#define NUM_ROUNDS 64 /* magic value */
216#define NUM_ITERS 5 /* likewise */
217
218/* Callback on boot CPU */
219static __cpuinit void sync_master(void *arg)
220{
221 unsigned long flags, i;
222
223 go[MASTER] = 0;
224
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235}
236
237/*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242static inline long
243get_delta(long *rt, long *master)
244{
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
248
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
256
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
260
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
263
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269}
270
271static __cpuinit void sync_tsc(unsigned int master)
272{
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276#ifdef DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283#endif
284
285 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
286 smp_processor_id(), master);
287
288 go[MASTER] = 1;
289
 290	/* It is dangerous to broadcast an IPI while CPUs are coming up,
 291	 * as they may not be ready to accept it. Since we only need
 292	 * to reach the boot CPU, direct the message to it and avoid
 293	 * the race.
 294	 */
295 smp_call_function_single(master, sync_master, NULL, 1, 0);
296
297 while (go[MASTER]) /* wait for master to be ready */
298 no_cpu_relax();
299
300 spin_lock_irqsave(&tsc_sync_lock, flags);
301 {
302 for (i = 0; i < NUM_ROUNDS; ++i) {
303 delta = get_delta(&rt, &master_time_stamp);
304 if (delta == 0) {
305 done = 1; /* let's lock on to this... */
306 bound = rt;
307 }
308
309 if (!done) {
310 unsigned long t;
311 if (i > 0) {
312 adjust_latency += -delta;
313 adj = -delta + adjust_latency/4;
314 } else
315 adj = -delta;
316
317 rdtscll(t);
318 wrmsrl(MSR_IA32_TSC, t + adj);
319 }
320#ifdef DEBUG_TSC_SYNC
321 t[i].rt = rt;
322 t[i].master = master_time_stamp;
323 t[i].diff = delta;
324 t[i].lat = adjust_latency/4;
325#endif
326 }
327 }
328 spin_unlock_irqrestore(&tsc_sync_lock, flags);
329
330#ifdef DEBUG_TSC_SYNC
331 for (i = 0; i < NUM_ROUNDS; ++i)
332 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
333 t[i].rt, t[i].master, t[i].diff, t[i].lat);
334#endif
335
336 printk(KERN_INFO
337 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
338 "maxerr %lu cycles)\n",
339 smp_processor_id(), master, delta, rt);
340}
341
342static void __cpuinit tsc_sync_wait(void)
343{
344 /*
 345	 * When the CPU has synchronized TSCs, assume the BIOS
 346	 * or the hardware already synced them. Otherwise we could
 347	 * mess up a possibly perfect synchronization with a
 348	 * not-quite-perfect algorithm.
349 */
350 if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
351 return;
352 sync_tsc(0);
353}
354
355static __init int notscsync_setup(char *s)
356{
357 notscsync = 1;
358 return 1;
359}
360__setup("notscsync", notscsync_setup);
361
362static atomic_t init_deasserted __cpuinitdata; 151static atomic_t init_deasserted __cpuinitdata;
363 152
364/* 153/*
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void)
546 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 335 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
547 barrier(); 336 barrier();
548 337
338 /*
339 * Check TSC sync first:
340 */
341 check_tsc_sync_target();
342
549 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 343 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
550 setup_secondary_APIC_clock(); 344 setup_secondary_APIC_clock();
551 345
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void)
565 */ 359 */
566 set_cpu_sibling_map(smp_processor_id()); 360 set_cpu_sibling_map(smp_processor_id());
567 361
568 /*
569 * Wait for TSC sync to not schedule things before.
570 * We still process interrupts, which could see an inconsistent
571 * time in that window unfortunately.
572 * Do this here because TSC sync has global unprotected state.
573 */
574 tsc_sync_wait();
575
576 /* 362 /*
577 * We need to hold call_lock, so there is no inconsistency 363 * We need to hold call_lock, so there is no inconsistency
578 * between the time smp_call_function() determines number of 364 * between the time smp_call_function() determines number of
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void)
592 cpu_set(smp_processor_id(), cpu_online_map); 378 cpu_set(smp_processor_id(), cpu_online_map);
593 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 379 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
594 spin_unlock(&vector_lock); 380 spin_unlock(&vector_lock);
381
595 unlock_ipi_call_lock(); 382 unlock_ipi_call_lock();
596 383
597 cpu_idle(); 384 cpu_idle();
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
1168 /* Unleash the CPU! */ 955 /* Unleash the CPU! */
1169 Dprintk("waiting for cpu %d\n", cpu); 956 Dprintk("waiting for cpu %d\n", cpu);
1170 957
958 /*
959 * Make sure and check TSC sync:
960 */
961 check_tsc_sync_source(cpu);
962
1171 while (!cpu_isset(cpu, cpu_online_map)) 963 while (!cpu_isset(cpu, cpu_online_map))
1172 cpu_relax(); 964 cpu_relax();
1173 965
@@ -1190,7 +982,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
1190 smp_cleanup_boot(); 982 smp_cleanup_boot();
1191 setup_ioapic_dest(); 983 setup_ioapic_dest();
1192 check_nmi_watchdog(); 984 check_nmi_watchdog();
1193 time_init_gtod();
1194} 985}
1195 986
1196#ifdef CONFIG_HOTPLUG_CPU 987#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 3cc6886f1fb7..a87c51705620 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,8 @@
42#include <linux/cpufreq.h> 42#include <linux/cpufreq.h>
43#include <linux/hpet.h> 43#include <linux/hpet.h>
44#include <asm/apic.h> 44#include <asm/apic.h>
45#include <asm/hpet.h>
45 46
46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void);
48#endif
49extern void i8254_timer_resume(void); 47extern void i8254_timer_resume(void);
50extern int using_apic_timer; 48extern int using_apic_timer;
51 49
@@ -55,128 +53,7 @@ DEFINE_SPINLOCK(rtc_lock);
55EXPORT_SYMBOL(rtc_lock); 53EXPORT_SYMBOL(rtc_lock);
56DEFINE_SPINLOCK(i8253_lock); 54DEFINE_SPINLOCK(i8253_lock);
57 55
58int nohpet __initdata = 0;
59static int notsc __initdata = 0;
60
61#define USEC_PER_TICK (USEC_PER_SEC / HZ)
62#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
63#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
64
65#define NS_SCALE 10 /* 2^10, carefully chosen */
 66#define US_SCALE	32 /* 2^32, arbitrarily chosen */
67
68unsigned int cpu_khz; /* TSC clocks / usec, not used here */
69EXPORT_SYMBOL(cpu_khz);
70static unsigned long hpet_period; /* fsecs / HPET clock */
71unsigned long hpet_tick; /* HPET clocks / interrupt */
72int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */
73unsigned long vxtime_hz = PIT_TICK_RATE;
74int report_lost_ticks; /* command line option */
75unsigned long long monotonic_base;
76
77struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
78
79volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 56volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
80struct timespec __xtime __section_xtime;
81struct timezone __sys_tz __section_sys_tz;
82
83/*
84 * do_gettimeoffset() returns microseconds since last timer interrupt was
85 * triggered by hardware. A memory read of HPET is slower than a register read
86 * of TSC, but much more reliable. It's also synchronized to the timer
87 * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
88 * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
89 * This is not a problem, because jiffies hasn't updated either. They are bound
90 * together by xtime_lock.
91 */
92
93static inline unsigned int do_gettimeoffset_tsc(void)
94{
95 unsigned long t;
96 unsigned long x;
97 t = get_cycles_sync();
98 if (t < vxtime.last_tsc)
99 t = vxtime.last_tsc; /* hack */
100 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;
101 return x;
102}
103
104static inline unsigned int do_gettimeoffset_hpet(void)
105{
106 /* cap counter read to one tick to avoid inconsistencies */
107 unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
108 return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE;
109}
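
Both helpers above are fixed-point conversions: vxtime.tsc_quot and vxtime.quot hold "microseconds per unit, scaled by 2^US_SCALE" (US_SCALE is 32), so the offset is (delta * quot) >> 32. A small sketch of the TSC variant with an assumed 2 GHz cpu_khz, outside the kernel's vxtime machinery:

#include <stdio.h>
#include <stdint.h>

#define US_SCALE      32
#define USEC_PER_MSEC 1000ULL

int main(void)
{
	uint64_t cpu_khz = 2000000;		/* assumed 2 GHz TSC */
	/* microseconds per cycle, scaled by 2^32, as in time_init() */
	uint64_t tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
	uint64_t delta_cycles = 6000000;	/* cycles since the last tick */

	/* 6e6 cycles at 2 GHz is 3000 us; prints 2999 because tsc_quot is truncated */
	printf("offset = %llu us\n",
	       (unsigned long long)((delta_cycles * tsc_quot) >> US_SCALE));
	return 0;
}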
110
111unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
112
113/*
114 * This version of gettimeofday() has microsecond resolution and better than
115 * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
116 * MHz) HPET timer.
117 */
118
119void do_gettimeofday(struct timeval *tv)
120{
121 unsigned long seq;
122 unsigned int sec, usec;
123
124 do {
125 seq = read_seqbegin(&xtime_lock);
126
127 sec = xtime.tv_sec;
128 usec = xtime.tv_nsec / NSEC_PER_USEC;
129
 130		/* i386 does some correction here to keep the clock
 131		   monotonic even when ntpd is adjusting for drift.
 132		   But that didn't work for me; the clock is non-monotonic
 133		   with NTP anyway.
 134		   I dropped all corrections for now until a real solution can
 135		   be found. Note that when you fix it here you need to do the same
 136		   in arch/x86_64/kernel/vsyscall.c and export all needed
 137		   variables in vmlinux.lds. -AK */
138 usec += do_gettimeoffset();
139
140 } while (read_seqretry(&xtime_lock, seq));
141
142 tv->tv_sec = sec + usec / USEC_PER_SEC;
143 tv->tv_usec = usec % USEC_PER_SEC;
144}
145
146EXPORT_SYMBOL(do_gettimeofday);
147
148/*
149 * settimeofday() first undoes the correction that gettimeofday would do
150 * on the time, and then saves it. This is ugly, but has been like this for
151 * ages already.
152 */
153
154int do_settimeofday(struct timespec *tv)
155{
156 time_t wtm_sec, sec = tv->tv_sec;
157 long wtm_nsec, nsec = tv->tv_nsec;
158
159 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
160 return -EINVAL;
161
162 write_seqlock_irq(&xtime_lock);
163
164 nsec -= do_gettimeoffset() * NSEC_PER_USEC;
165
166 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
167 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
168
169 set_normalized_timespec(&xtime, sec, nsec);
170 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
171
172 ntp_clear();
173
174 write_sequnlock_irq(&xtime_lock);
175 clock_was_set();
176 return 0;
177}
178
179EXPORT_SYMBOL(do_settimeofday);
180 57
181unsigned long profile_pc(struct pt_regs *regs) 58unsigned long profile_pc(struct pt_regs *regs)
182{ 59{
@@ -267,84 +144,9 @@ static void set_rtc_mmss(unsigned long nowtime)
267} 144}
268 145
269 146
270/* monotonic_clock(): returns # of nanoseconds passed since time_init()
271 * Note: This function is required to return accurate
272 * time even in the absence of multiple timer ticks.
273 */
274static inline unsigned long long cycles_2_ns(unsigned long long cyc);
275unsigned long long monotonic_clock(void)
276{
277 unsigned long seq;
278 u32 last_offset, this_offset, offset;
279 unsigned long long base;
280
281 if (vxtime.mode == VXTIME_HPET) {
282 do {
283 seq = read_seqbegin(&xtime_lock);
284
285 last_offset = vxtime.last;
286 base = monotonic_base;
287 this_offset = hpet_readl(HPET_COUNTER);
288 } while (read_seqretry(&xtime_lock, seq));
289 offset = (this_offset - last_offset);
290 offset *= NSEC_PER_TICK / hpet_tick;
291 } else {
292 do {
293 seq = read_seqbegin(&xtime_lock);
294
295 last_offset = vxtime.last_tsc;
296 base = monotonic_base;
297 } while (read_seqretry(&xtime_lock, seq));
298 this_offset = get_cycles_sync();
299 offset = cycles_2_ns(this_offset - last_offset);
300 }
301 return base + offset;
302}
303EXPORT_SYMBOL(monotonic_clock);
304
305static noinline void handle_lost_ticks(int lost)
306{
307 static long lost_count;
308 static int warned;
309 if (report_lost_ticks) {
310 printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost);
311 print_symbol("rip %s)\n", get_irq_regs()->rip);
312 }
313
314 if (lost_count == 1000 && !warned) {
315 printk(KERN_WARNING "warning: many lost ticks.\n"
 316		       KERN_WARNING "Your time source seems to be unstable or "
 317				"some driver is hogging interrupts\n");
318 print_symbol("rip %s\n", get_irq_regs()->rip);
319 if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
320 printk(KERN_WARNING "Falling back to HPET\n");
321 if (hpet_use_timer)
322 vxtime.last = hpet_readl(HPET_T0_CMP) -
323 hpet_tick;
324 else
325 vxtime.last = hpet_readl(HPET_COUNTER);
326 vxtime.mode = VXTIME_HPET;
327 do_gettimeoffset = do_gettimeoffset_hpet;
328 }
329 /* else should fall back to PIT, but code missing. */
330 warned = 1;
331 } else
332 lost_count++;
333
334#ifdef CONFIG_CPU_FREQ
 335	/* In some cases the CPU can change frequency without us noticing.
 336	   Give cpufreq a chance to catch up. */
337 if ((lost_count+1) % 25 == 0)
338 cpufreq_delayed_get();
339#endif
340}
341
342void main_timer_handler(void) 147void main_timer_handler(void)
343{ 148{
344 static unsigned long rtc_update = 0; 149 static unsigned long rtc_update = 0;
345 unsigned long tsc;
346 int delay = 0, offset = 0, lost = 0;
347
348/* 150/*
349 * Here we are in the timer irq handler. We have irqs locally disabled (so we 151 * Here we are in the timer irq handler. We have irqs locally disabled (so we
350 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running 152 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
@@ -354,72 +156,11 @@ void main_timer_handler(void)
354 156
355 write_seqlock(&xtime_lock); 157 write_seqlock(&xtime_lock);
356 158
357 if (vxtime.hpet_address)
358 offset = hpet_readl(HPET_COUNTER);
359
360 if (hpet_use_timer) {
361 /* if we're using the hpet timer functionality,
362 * we can more accurately know the counter value
 363		 * when the timer interrupt occurred.
364 */
365 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
366 delay = hpet_readl(HPET_COUNTER) - offset;
367 } else if (!pmtmr_ioport) {
368 spin_lock(&i8253_lock);
369 outb_p(0x00, 0x43);
370 delay = inb_p(0x40);
371 delay |= inb(0x40) << 8;
372 spin_unlock(&i8253_lock);
373 delay = LATCH - 1 - delay;
374 }
375
376 tsc = get_cycles_sync();
377
378 if (vxtime.mode == VXTIME_HPET) {
379 if (offset - vxtime.last > hpet_tick) {
380 lost = (offset - vxtime.last) / hpet_tick - 1;
381 }
382
383 monotonic_base +=
384 (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;
385
386 vxtime.last = offset;
387#ifdef CONFIG_X86_PM_TIMER
388 } else if (vxtime.mode == VXTIME_PMTMR) {
389 lost = pmtimer_mark_offset();
390#endif
391 } else {
392 offset = (((tsc - vxtime.last_tsc) *
393 vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;
394
395 if (offset < 0)
396 offset = 0;
397
398 if (offset > USEC_PER_TICK) {
399 lost = offset / USEC_PER_TICK;
400 offset %= USEC_PER_TICK;
401 }
402
403 monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
404
405 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
406
407 if ((((tsc - vxtime.last_tsc) *
408 vxtime.tsc_quot) >> US_SCALE) < offset)
409 vxtime.last_tsc = tsc -
410 (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
411 }
412
413 if (lost > 0)
414 handle_lost_ticks(lost);
415 else
416 lost = 0;
417
418/* 159/*
419 * Do the timer stuff. 160 * Do the timer stuff.
420 */ 161 */
421 162
422 do_timer(lost + 1); 163 do_timer(1);
423#ifndef CONFIG_SMP 164#ifndef CONFIG_SMP
424 update_process_times(user_mode(get_irq_regs())); 165 update_process_times(user_mode(get_irq_regs()));
425#endif 166#endif
@@ -460,40 +201,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
460 return IRQ_HANDLED; 201 return IRQ_HANDLED;
461} 202}
462 203
463static unsigned int cyc2ns_scale __read_mostly;
464
465static inline void set_cyc2ns_scale(unsigned long cpu_khz)
466{
467 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
468}
469
470static inline unsigned long long cycles_2_ns(unsigned long long cyc)
471{
472 return (cyc * cyc2ns_scale) >> NS_SCALE;
473}
474
475unsigned long long sched_clock(void)
476{
477 unsigned long a = 0;
478
479#if 0
480 /* Don't do a HPET read here. Using TSC always is much faster
481 and HPET may not be mapped yet when the scheduler first runs.
482 Disadvantage is a small drift between CPUs in some configurations,
483 but that should be tolerable. */
484 if (__vxtime.mode == VXTIME_HPET)
485 return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;
486#endif
487
488 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
489 which means it is not completely exact and may not be monotonous between
490 CPUs. But the errors should be too small to matter for scheduling
491 purposes. */
492
493 rdtscll(a);
494 return cycles_2_ns(a);
495}
496
497static unsigned long get_cmos_time(void) 204static unsigned long get_cmos_time(void)
498{ 205{
499 unsigned int year, mon, day, hour, min, sec; 206 unsigned int year, mon, day, hour, min, sec;
@@ -545,164 +252,6 @@ static unsigned long get_cmos_time(void)
545 return mktime(year, mon, day, hour, min, sec); 252 return mktime(year, mon, day, hour, min, sec);
546} 253}
547 254
548#ifdef CONFIG_CPU_FREQ
549
550/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
551 changes.
552
553 RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
554 not that important because current Opteron setups do not support
555 scaling on SMP anyroads.
556
557 Should fix up last_tsc too. Currently gettimeofday in the
558 first tick after the change will be slightly wrong. */
559
560#include <linux/workqueue.h>
561
562static unsigned int cpufreq_delayed_issched = 0;
563static unsigned int cpufreq_init = 0;
564static struct work_struct cpufreq_delayed_get_work;
565
566static void handle_cpufreq_delayed_get(struct work_struct *v)
567{
568 unsigned int cpu;
569 for_each_online_cpu(cpu) {
570 cpufreq_get(cpu);
571 }
572 cpufreq_delayed_issched = 0;
573}
574
 575/* If we notice lost ticks, schedule a call to cpufreq_get(), which
 576 * verifies that the CPU frequency the timing core thinks the CPU is
 577 * running at is still correct.
578 */
579static void cpufreq_delayed_get(void)
580{
581 static int warned;
582 if (cpufreq_init && !cpufreq_delayed_issched) {
583 cpufreq_delayed_issched = 1;
584 if (!warned) {
585 warned = 1;
586 printk(KERN_DEBUG
587 "Losing some ticks... checking if CPU frequency changed.\n");
588 }
589 schedule_work(&cpufreq_delayed_get_work);
590 }
591}
592
593static unsigned int ref_freq = 0;
594static unsigned long loops_per_jiffy_ref = 0;
595
596static unsigned long cpu_khz_ref = 0;
597
598static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
599 void *data)
600{
601 struct cpufreq_freqs *freq = data;
602 unsigned long *lpj, dummy;
603
604 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
605 return 0;
606
607 lpj = &dummy;
608 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
609#ifdef CONFIG_SMP
610 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
611#else
612 lpj = &boot_cpu_data.loops_per_jiffy;
613#endif
614
615 if (!ref_freq) {
616 ref_freq = freq->old;
617 loops_per_jiffy_ref = *lpj;
618 cpu_khz_ref = cpu_khz;
619 }
620 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
621 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
622 (val == CPUFREQ_RESUMECHANGE)) {
623 *lpj =
624 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
625
626 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
627 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
628 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
629 }
630
631 set_cyc2ns_scale(cpu_khz_ref);
632
633 return 0;
634}
635
636static struct notifier_block time_cpufreq_notifier_block = {
637 .notifier_call = time_cpufreq_notifier
638};
639
640static int __init cpufreq_tsc(void)
641{
642 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
643 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
644 CPUFREQ_TRANSITION_NOTIFIER))
645 cpufreq_init = 1;
646 return 0;
647}
648
649core_initcall(cpufreq_tsc);
650
651#endif
652
653/*
654 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
655 * it to the HPET timer of known frequency.
656 */
657
658#define TICK_COUNT 100000000
659#define TICK_MIN 5000
660#define MAX_READ_RETRIES 5
661
662/*
663 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
664 * occurs between the reads of the hpet & TSC.
665 */
666static void __init read_hpet_tsc(int *hpet, int *tsc)
667{
668 int tsc1, tsc2, hpet1, retries = 0;
669 static int msg;
670
671 do {
672 tsc1 = get_cycles_sync();
673 hpet1 = hpet_readl(HPET_COUNTER);
674 tsc2 = get_cycles_sync();
675 } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES);
676 if (retries >= MAX_READ_RETRIES && !msg++)
677 printk(KERN_WARNING
678 "hpet.c: exceeded max retries to read HPET & TSC\n");
679 *hpet = hpet1;
680 *tsc = tsc2;
681}
682
683
684static unsigned int __init hpet_calibrate_tsc(void)
685{
686 int tsc_start, hpet_start;
687 int tsc_now, hpet_now;
688 unsigned long flags;
689
690 local_irq_save(flags);
691 local_irq_disable();
692
693 read_hpet_tsc(&hpet_start, &tsc_start);
694
695 do {
696 local_irq_disable();
697 read_hpet_tsc(&hpet_now, &tsc_now);
698 local_irq_restore(flags);
699 } while ((tsc_now - tsc_start) < TICK_COUNT &&
700 (hpet_now - hpet_start) < TICK_COUNT);
701
702 return (tsc_now - tsc_start) * 1000000000L
703 / ((hpet_now - hpet_start) * hpet_period / 1000);
704}
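
The return expression above is just "TSC cycles per elapsed HPET time": hpet_period is femtoseconds per HPET tick, so hpet_delta * hpet_period / 1000 is the elapsed time in picoseconds, and cycles * 10^9 / picoseconds comes out in kHz. A sketch with assumed values for a 14.31818 MHz HPET and a 2 GHz TSC:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assumed: 14.31818 MHz HPET, ~0.1 s of counting, 2 GHz TSC */
	uint64_t hpet_period = 69841279;	/* femtoseconds per HPET tick */
	uint64_t hpet_delta  = 1431818;		/* HPET ticks elapsed (~0.1 s) */
	uint64_t tsc_delta   = 200000000;	/* TSC cycles over the same window */

	/* hpet_delta * hpet_period / 1000 = elapsed picoseconds;
	 * cycles * 1e9 / picoseconds = kHz (expect roughly 2000000 here) */
	uint64_t cpu_khz = tsc_delta * 1000000000ULL /
			   (hpet_delta * hpet_period / 1000);

	printf("cpu_khz ~= %llu\n", (unsigned long long)cpu_khz);
	return 0;
}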
705
706 255
707/* 256/*
708 * pit_calibrate_tsc() uses the speaker output (channel 2) of 257 * pit_calibrate_tsc() uses the speaker output (channel 2) of
@@ -733,124 +282,6 @@ static unsigned int __init pit_calibrate_tsc(void)
733 return (end - start) / 50; 282 return (end - start) / 50;
734} 283}
735 284
736#ifdef CONFIG_HPET
737static __init int late_hpet_init(void)
738{
739 struct hpet_data hd;
740 unsigned int ntimer;
741
742 if (!vxtime.hpet_address)
743 return 0;
744
745 memset(&hd, 0, sizeof (hd));
746
747 ntimer = hpet_readl(HPET_ID);
748 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
749 ntimer++;
750
751 /*
752 * Register with driver.
 753	 * Timer0 and Timer1 are used by the platform.
754 */
755 hd.hd_phys_address = vxtime.hpet_address;
756 hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
757 hd.hd_nirqs = ntimer;
758 hd.hd_flags = HPET_DATA_PLATFORM;
759 hpet_reserve_timer(&hd, 0);
760#ifdef CONFIG_HPET_EMULATE_RTC
761 hpet_reserve_timer(&hd, 1);
762#endif
763 hd.hd_irq[0] = HPET_LEGACY_8254;
764 hd.hd_irq[1] = HPET_LEGACY_RTC;
765 if (ntimer > 2) {
766 struct hpet *hpet;
767 struct hpet_timer *timer;
768 int i;
769
770 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
771 timer = &hpet->hpet_timers[2];
772 for (i = 2; i < ntimer; timer++, i++)
773 hd.hd_irq[i] = (timer->hpet_config &
774 Tn_INT_ROUTE_CNF_MASK) >>
775 Tn_INT_ROUTE_CNF_SHIFT;
776
777 }
778
779 hpet_alloc(&hd);
780 return 0;
781}
782fs_initcall(late_hpet_init);
783#endif
784
785static int hpet_timer_stop_set_go(unsigned long tick)
786{
787 unsigned int cfg;
788
789/*
790 * Stop the timers and reset the main counter.
791 */
792
793 cfg = hpet_readl(HPET_CFG);
794 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
795 hpet_writel(cfg, HPET_CFG);
796 hpet_writel(0, HPET_COUNTER);
797 hpet_writel(0, HPET_COUNTER + 4);
798
799/*
800 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
801 * and period also hpet_tick.
802 */
803 if (hpet_use_timer) {
804 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
805 HPET_TN_32BIT, HPET_T0_CFG);
806 hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
807 hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
808 cfg |= HPET_CFG_LEGACY;
809 }
810/*
811 * Go!
812 */
813
814 cfg |= HPET_CFG_ENABLE;
815 hpet_writel(cfg, HPET_CFG);
816
817 return 0;
818}
819
820static int hpet_init(void)
821{
822 unsigned int id;
823
824 if (!vxtime.hpet_address)
825 return -1;
826 set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
827 __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
828
829/*
830 * Read the period, compute tick and quotient.
831 */
832
833 id = hpet_readl(HPET_ID);
834
835 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
836 return -1;
837
838 hpet_period = hpet_readl(HPET_PERIOD);
839 if (hpet_period < 100000 || hpet_period > 100000000)
840 return -1;
841
842 hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
843
844 hpet_use_timer = (id & HPET_ID_LEGSUP);
845
846 return hpet_timer_stop_set_go(hpet_tick);
847}
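
hpet_tick above is a rounded integer division: femtoseconds per kernel tick divided by femtoseconds per HPET tick, with half an hpet_period added so the result rounds to nearest rather than truncating. A sketch assuming HZ=250 and the common 14.31818 MHz HPET period:

#include <stdio.h>
#include <stdint.h>

#define HZ            250
#define FSEC_PER_SEC  1000000000000000ULL
#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)	/* 4e12 fs per kernel tick */

int main(void)
{
	uint64_t hpet_period = 69841279;	/* fs per HPET tick (14.31818 MHz) */
	/* round to nearest: add half a period before dividing */
	uint64_t hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;

	/* ~57273 HPET ticks per timer interrupt at HZ=250 */
	printf("hpet_tick = %llu\n", (unsigned long long)hpet_tick);
	return 0;
}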
848
849static int hpet_reenable(void)
850{
851 return hpet_timer_stop_set_go(hpet_tick);
852}
853
854#define PIT_MODE 0x43 285#define PIT_MODE 0x43
855#define PIT_CH0 0x40 286#define PIT_CH0 0x40
856 287
@@ -878,7 +309,7 @@ void __init pit_stop_interrupt(void)
878void __init stop_timer_interrupt(void) 309void __init stop_timer_interrupt(void)
879{ 310{
880 char *name; 311 char *name;
881 if (vxtime.hpet_address) { 312 if (hpet_address) {
882 name = "HPET"; 313 name = "HPET";
883 hpet_timer_stop_set_go(0); 314 hpet_timer_stop_set_go(0);
884 } else { 315 } else {
@@ -888,12 +319,6 @@ void __init stop_timer_interrupt(void)
888 printk(KERN_INFO "timer: %s interrupt stopped.\n", name); 319 printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
889} 320}
890 321
891int __init time_setup(char *str)
892{
893 report_lost_ticks = 1;
894 return 1;
895}
896
897static struct irqaction irq0 = { 322static struct irqaction irq0 = {
898 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL 323 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
899}; 324};
@@ -901,124 +326,41 @@ static struct irqaction irq0 = {
901void __init time_init(void) 326void __init time_init(void)
902{ 327{
903 if (nohpet) 328 if (nohpet)
904 vxtime.hpet_address = 0; 329 hpet_address = 0;
905
906 xtime.tv_sec = get_cmos_time(); 330 xtime.tv_sec = get_cmos_time();
907 xtime.tv_nsec = 0; 331 xtime.tv_nsec = 0;
908 332
909 set_normalized_timespec(&wall_to_monotonic, 333 set_normalized_timespec(&wall_to_monotonic,
910 -xtime.tv_sec, -xtime.tv_nsec); 334 -xtime.tv_sec, -xtime.tv_nsec);
911 335
912 if (!hpet_init()) 336 if (hpet_arch_init())
913 vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; 337 hpet_address = 0;
914 else
915 vxtime.hpet_address = 0;
916 338
917 if (hpet_use_timer) { 339 if (hpet_use_timer) {
918 /* set tick_nsec to use the proper rate for HPET */ 340 /* set tick_nsec to use the proper rate for HPET */
919 tick_nsec = TICK_NSEC_HPET; 341 tick_nsec = TICK_NSEC_HPET;
920 cpu_khz = hpet_calibrate_tsc(); 342 cpu_khz = hpet_calibrate_tsc();
921 timename = "HPET"; 343 timename = "HPET";
922#ifdef CONFIG_X86_PM_TIMER
923 } else if (pmtmr_ioport && !vxtime.hpet_address) {
924 vxtime_hz = PM_TIMER_FREQUENCY;
925 timename = "PM";
926 pit_init();
927 cpu_khz = pit_calibrate_tsc();
928#endif
929 } else { 344 } else {
930 pit_init(); 345 pit_init();
931 cpu_khz = pit_calibrate_tsc(); 346 cpu_khz = pit_calibrate_tsc();
932 timename = "PIT"; 347 timename = "PIT";
933 } 348 }
934 349
935 vxtime.mode = VXTIME_TSC;
936 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
937 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
938 vxtime.last_tsc = get_cycles_sync();
939 set_cyc2ns_scale(cpu_khz);
940 setup_irq(0, &irq0);
941
942#ifndef CONFIG_SMP
943 time_init_gtod();
944#endif
945}
946
947/*
948 * Make an educated guess if the TSC is trustworthy and synchronized
949 * over all CPUs.
950 */
951__cpuinit int unsynchronized_tsc(void)
952{
953#ifdef CONFIG_SMP
954 if (apic_is_clustered_box())
955 return 1;
956#endif
957 /* Most intel systems have synchronized TSCs except for
958 multi node systems */
959 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
960#ifdef CONFIG_ACPI
961 /* But TSC doesn't tick in C3 so don't use it there */
962 if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
963 return 1;
964#endif
965 return 0;
966 }
967
968 /* Assume multi socket systems are not synchronized */
969 return num_present_cpus() > 1;
970}
971
972/*
973 * Decide what mode gettimeofday should use.
974 */
975void time_init_gtod(void)
976{
977 char *timetype;
978
979 if (unsynchronized_tsc()) 350 if (unsynchronized_tsc())
980 notsc = 1; 351 mark_tsc_unstable();
981 352
982 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) 353 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
983 vgetcpu_mode = VGETCPU_RDTSCP; 354 vgetcpu_mode = VGETCPU_RDTSCP;
984 else 355 else
985 vgetcpu_mode = VGETCPU_LSL; 356 vgetcpu_mode = VGETCPU_LSL;
986 357
987 if (vxtime.hpet_address && notsc) { 358 set_cyc2ns_scale(cpu_khz);
988 timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
989 if (hpet_use_timer)
990 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
991 else
992 vxtime.last = hpet_readl(HPET_COUNTER);
993 vxtime.mode = VXTIME_HPET;
994 do_gettimeoffset = do_gettimeoffset_hpet;
995#ifdef CONFIG_X86_PM_TIMER
996 /* Using PM for gettimeofday is quite slow, but we have no other
997 choice because the TSC is too unreliable on some systems. */
998 } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
999 timetype = "PM";
1000 do_gettimeoffset = do_gettimeoffset_pm;
1001 vxtime.mode = VXTIME_PMTMR;
1002 sysctl_vsyscall = 0;
1003 printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
1004#endif
1005 } else {
1006 timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
1007 vxtime.mode = VXTIME_TSC;
1008 }
1009
1010 printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
1011 vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
1012 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", 359 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
1013 cpu_khz / 1000, cpu_khz % 1000); 360 cpu_khz / 1000, cpu_khz % 1000);
1014 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; 361 setup_irq(0, &irq0);
1015 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
1016 vxtime.last_tsc = get_cycles_sync();
1017
1018 set_cyc2ns_scale(cpu_khz);
1019} 362}
1020 363
1021__setup("report_lost_ticks", time_setup);
1022 364
1023static long clock_cmos_diff; 365static long clock_cmos_diff;
1024static unsigned long sleep_start; 366static unsigned long sleep_start;
@@ -1055,7 +397,7 @@ static int timer_resume(struct sys_device *dev)
1055 sleep_length = 0; 397 sleep_length = 0;
1056 ctime = sleep_start; 398 ctime = sleep_start;
1057 } 399 }
1058 if (vxtime.hpet_address) 400 if (hpet_address)
1059 hpet_reenable(); 401 hpet_reenable();
1060 else 402 else
1061 i8254_timer_resume(); 403 i8254_timer_resume();
@@ -1064,20 +406,8 @@ static int timer_resume(struct sys_device *dev)
1064 write_seqlock_irqsave(&xtime_lock,flags); 406 write_seqlock_irqsave(&xtime_lock,flags);
1065 xtime.tv_sec = sec; 407 xtime.tv_sec = sec;
1066 xtime.tv_nsec = 0; 408 xtime.tv_nsec = 0;
1067 if (vxtime.mode == VXTIME_HPET) {
1068 if (hpet_use_timer)
1069 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
1070 else
1071 vxtime.last = hpet_readl(HPET_COUNTER);
1072#ifdef CONFIG_X86_PM_TIMER
1073 } else if (vxtime.mode == VXTIME_PMTMR) {
1074 pmtimer_resume();
1075#endif
1076 } else
1077 vxtime.last_tsc = get_cycles_sync();
1078 write_sequnlock_irqrestore(&xtime_lock,flags);
1079 jiffies += sleep_length; 409 jiffies += sleep_length;
1080 monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); 410 write_sequnlock_irqrestore(&xtime_lock,flags);
1081 touch_softlockup_watchdog(); 411 touch_softlockup_watchdog();
1082 return 0; 412 return 0;
1083} 413}
@@ -1103,270 +433,3 @@ static int time_init_device(void)
1103} 433}
1104 434
1105device_initcall(time_init_device); 435device_initcall(time_init_device);
1106
1107#ifdef CONFIG_HPET_EMULATE_RTC
1108/* HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
1109 * is enabled, we support RTC interrupt functionality in software.
1110 * RTC has 3 kinds of interrupts:
1111 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
1112 * is updated
1113 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
1114 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
1115 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
1116 * (1) and (2) above are implemented using polling at a frequency of
1117 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
1118 * overhead. (DEFAULT_RTC_INT_FREQ)
1119 * For (3), we use interrupts at 64Hz or user specified periodic
1120 * frequency, whichever is higher.
1121 */
1122#include <linux/rtc.h>
1123
1124#define DEFAULT_RTC_INT_FREQ 64
1125#define RTC_NUM_INTS 1
1126
1127static unsigned long UIE_on;
1128static unsigned long prev_update_sec;
1129
1130static unsigned long AIE_on;
1131static struct rtc_time alarm_time;
1132
1133static unsigned long PIE_on;
1134static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
1135static unsigned long PIE_count;
1136
1137static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
1138static unsigned int hpet_t1_cmp; /* cached comparator register */
1139
1140int is_hpet_enabled(void)
1141{
1142 return vxtime.hpet_address != 0;
1143}
1144
1145/*
1146 * Timer 1 for RTC, we do not use periodic interrupt feature,
1147 * even if HPET supports periodic interrupts on Timer 1.
1148 * The reason: to set up a periodic interrupt in HPET, we need to
1149 * stop the main counter, and doing that every time someone disables/enables
1150 * the RTC would disturb the main kernel timer running on Timer 0.
1151 * So, for the time being, simulate the periodic interrupt in software.
1152 *
1153 * hpet_rtc_timer_init() is called the first time; on subsequent
1154 * interrupts the reinit happens through hpet_rtc_timer_reinit().
1155 */
1156int hpet_rtc_timer_init(void)
1157{
1158 unsigned int cfg, cnt;
1159 unsigned long flags;
1160
1161 if (!is_hpet_enabled())
1162 return 0;
1163 /*
1164 * Set the counter 1 and enable the interrupts.
1165 */
1166 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1167 hpet_rtc_int_freq = PIE_freq;
1168 else
1169 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1170
1171 local_irq_save(flags);
1172
1173 cnt = hpet_readl(HPET_COUNTER);
1174 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
1175 hpet_writel(cnt, HPET_T1_CMP);
1176 hpet_t1_cmp = cnt;
1177
1178 cfg = hpet_readl(HPET_T1_CFG);
1179 cfg &= ~HPET_TN_PERIODIC;
1180 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
1181 hpet_writel(cfg, HPET_T1_CFG);
1182
1183 local_irq_restore(flags);
1184
1185 return 1;
1186}
1187
1188static void hpet_rtc_timer_reinit(void)
1189{
1190 unsigned int cfg, cnt, ticks_per_int, lost_ints;
1191
1192 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
1193 cfg = hpet_readl(HPET_T1_CFG);
1194 cfg &= ~HPET_TN_ENABLE;
1195 hpet_writel(cfg, HPET_T1_CFG);
1196 return;
1197 }
1198
1199 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1200 hpet_rtc_int_freq = PIE_freq;
1201 else
1202 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1203
1204	/* It is more accurate to use the comparator value than the current count. */
1205 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
1206 hpet_t1_cmp += ticks_per_int;
1207 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1208
1209 /*
1210 * If the interrupt handler was delayed too long, the write above tries
1211 * to schedule the next interrupt in the past and the hardware would
1212 * not interrupt until the counter had wrapped around.
1213 * So we have to check that the comparator wasn't set to a past time.
1214 */
1215 cnt = hpet_readl(HPET_COUNTER);
1216 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
1217 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
1218 /* Make sure that, even with the time needed to execute
1219 * this code, the next scheduled interrupt has been moved
1220 * back to the future: */
1221 lost_ints++;
1222
1223 hpet_t1_cmp += lost_ints * ticks_per_int;
1224 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1225
1226 if (PIE_on)
1227 PIE_count += lost_ints;
1228
1229 if (printk_ratelimit())
1230 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
1231 hpet_rtc_int_freq);
1232 }
1233}
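
The catch-up path above asks: if the comparator ended up in the past, how many whole interrupt periods were missed? It then pushes the comparator that many periods (plus one extra for margin) into the future. A small sketch of the same arithmetic with invented counter values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ticks_per_int = 14318;		/* assumed HPET ticks per RTC interrupt */
	uint32_t hpet_t1_cmp   = 1000000;	/* comparator we just tried to program */
	uint32_t cnt           = 1040000;	/* counter is already past the comparator */

	if ((int32_t)(cnt - hpet_t1_cmp) > 0) {
		/* whole periods missed, +1 for the partially elapsed one */
		uint32_t lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;

		lost_ints++;			/* one more period of margin, as above */
		hpet_t1_cmp += lost_ints * ticks_per_int;
	}
	printf("new comparator = %u\n", hpet_t1_cmp);	/* now safely in the future */
	return 0;
}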
1234
1235/*
1236 * The functions below are called from rtc driver.
1237 * Return 0 if HPET is not being used.
1238 * Otherwise do the necessary changes and return 1.
1239 */
1240int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1241{
1242 if (!is_hpet_enabled())
1243 return 0;
1244
1245 if (bit_mask & RTC_UIE)
1246 UIE_on = 0;
1247 if (bit_mask & RTC_PIE)
1248 PIE_on = 0;
1249 if (bit_mask & RTC_AIE)
1250 AIE_on = 0;
1251
1252 return 1;
1253}
1254
1255int hpet_set_rtc_irq_bit(unsigned long bit_mask)
1256{
1257 int timer_init_reqd = 0;
1258
1259 if (!is_hpet_enabled())
1260 return 0;
1261
1262 if (!(PIE_on | AIE_on | UIE_on))
1263 timer_init_reqd = 1;
1264
1265 if (bit_mask & RTC_UIE) {
1266 UIE_on = 1;
1267 }
1268 if (bit_mask & RTC_PIE) {
1269 PIE_on = 1;
1270 PIE_count = 0;
1271 }
1272 if (bit_mask & RTC_AIE) {
1273 AIE_on = 1;
1274 }
1275
1276 if (timer_init_reqd)
1277 hpet_rtc_timer_init();
1278
1279 return 1;
1280}
1281
1282int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
1283{
1284 if (!is_hpet_enabled())
1285 return 0;
1286
1287 alarm_time.tm_hour = hrs;
1288 alarm_time.tm_min = min;
1289 alarm_time.tm_sec = sec;
1290
1291 return 1;
1292}
1293
1294int hpet_set_periodic_freq(unsigned long freq)
1295{
1296 if (!is_hpet_enabled())
1297 return 0;
1298
1299 PIE_freq = freq;
1300 PIE_count = 0;
1301
1302 return 1;
1303}
1304
1305int hpet_rtc_dropped_irq(void)
1306{
1307 if (!is_hpet_enabled())
1308 return 0;
1309
1310 return 1;
1311}
1312
1313irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1314{
1315 struct rtc_time curr_time;
1316 unsigned long rtc_int_flag = 0;
1317 int call_rtc_interrupt = 0;
1318
1319 hpet_rtc_timer_reinit();
1320
1321 if (UIE_on | AIE_on) {
1322 rtc_get_rtc_time(&curr_time);
1323 }
1324 if (UIE_on) {
1325 if (curr_time.tm_sec != prev_update_sec) {
1326 /* Set update int info, call real rtc int routine */
1327 call_rtc_interrupt = 1;
1328 rtc_int_flag = RTC_UF;
1329 prev_update_sec = curr_time.tm_sec;
1330 }
1331 }
1332 if (PIE_on) {
1333 PIE_count++;
1334 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
1335 /* Set periodic int info, call real rtc int routine */
1336 call_rtc_interrupt = 1;
1337 rtc_int_flag |= RTC_PF;
1338 PIE_count = 0;
1339 }
1340 }
1341 if (AIE_on) {
1342 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
1343 (curr_time.tm_min == alarm_time.tm_min) &&
1344 (curr_time.tm_hour == alarm_time.tm_hour)) {
1345 /* Set alarm int info, call real rtc int routine */
1346 call_rtc_interrupt = 1;
1347 rtc_int_flag |= RTC_AF;
1348 }
1349 }
1350 if (call_rtc_interrupt) {
1351 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
1352 rtc_interrupt(rtc_int_flag, dev_id);
1353 }
1354 return IRQ_HANDLED;
1355}
1356#endif
1357
1358static int __init nohpet_setup(char *s)
1359{
1360 nohpet = 1;
1361 return 1;
1362}
1363
1364__setup("nohpet", nohpet_setup);
1365
1366int __init notsc_setup(char *s)
1367{
1368 notsc = 1;
1369 return 1;
1370}
1371
1372__setup("notsc", notsc_setup);
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
new file mode 100644
index 000000000000..895831865019
--- /dev/null
+++ b/arch/x86_64/kernel/tsc.c
@@ -0,0 +1,226 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9
10#include <asm/timex.h>
11
12static int notsc __initdata = 0;
13
14unsigned int cpu_khz; /* TSC clocks / usec, not used here */
15EXPORT_SYMBOL(cpu_khz);
16
17static unsigned int cyc2ns_scale __read_mostly;
18
19void set_cyc2ns_scale(unsigned long khz)
20{
21 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
22}
23
24static unsigned long long cycles_2_ns(unsigned long long cyc)
25{
26 return (cyc * cyc2ns_scale) >> NS_SCALE;
27}
28
29unsigned long long sched_clock(void)
30{
31 unsigned long a = 0;
32
33 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
34 * which means it is not completely exact and may not be monotonous
35 * between CPUs. But the errors should be too small to matter for
36 * scheduling purposes.
37 */
38
39 rdtscll(a);
40 return cycles_2_ns(a);
41}
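
set_cyc2ns_scale() stores nanoseconds-per-cycle as a fixed-point fraction with NS_SCALE (10) fractional bits, so cycles_2_ns() is just a multiply and a shift. A sketch with an assumed 2 GHz cpu_khz:

#include <stdio.h>
#include <stdint.h>

#define NS_SCALE      10
#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t cpu_khz = 2000000;					/* assumed 2 GHz */
	uint64_t cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;	/* 512 */
	uint64_t cycles = 4000;

	/* 4000 cycles at 2 GHz is 2000 ns */
	printf("%llu ns\n",
	       (unsigned long long)((cycles * cyc2ns_scale) >> NS_SCALE));
	return 0;
}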
42
43static int tsc_unstable;
44
45static inline int check_tsc_unstable(void)
46{
47 return tsc_unstable;
48}
49#ifdef CONFIG_CPU_FREQ
50
51/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
52 * changes.
53 *
54 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
55 * not that important because current Opteron setups do not support
56 * scaling on SMP anyroads.
57 *
58 * Should fix up last_tsc too. Currently gettimeofday in the
59 * first tick after the change will be slightly wrong.
60 */
61
62#include <linux/workqueue.h>
63
64static unsigned int cpufreq_delayed_issched = 0;
65static unsigned int cpufreq_init = 0;
66static struct work_struct cpufreq_delayed_get_work;
67
68static void handle_cpufreq_delayed_get(struct work_struct *v)
69{
70 unsigned int cpu;
71 for_each_online_cpu(cpu) {
72 cpufreq_get(cpu);
73 }
74 cpufreq_delayed_issched = 0;
75}
76
77static unsigned int ref_freq = 0;
78static unsigned long loops_per_jiffy_ref = 0;
79
80static unsigned long cpu_khz_ref = 0;
81
82static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
83 void *data)
84{
85 struct cpufreq_freqs *freq = data;
86 unsigned long *lpj, dummy;
87
88 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
89 return 0;
90
91 lpj = &dummy;
92 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
93#ifdef CONFIG_SMP
94 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
95#else
96 lpj = &boot_cpu_data.loops_per_jiffy;
97#endif
98
99 if (!ref_freq) {
100 ref_freq = freq->old;
101 loops_per_jiffy_ref = *lpj;
102 cpu_khz_ref = cpu_khz;
103 }
104 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
105 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
106 (val == CPUFREQ_RESUMECHANGE)) {
107 *lpj =
108 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
109
110 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
111 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
112 mark_tsc_unstable();
113 }
114
115 set_cyc2ns_scale(cpu_khz_ref);
116
117 return 0;
118}
119
120static struct notifier_block time_cpufreq_notifier_block = {
121 .notifier_call = time_cpufreq_notifier
122};
123
124static int __init cpufreq_tsc(void)
125{
126 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
127 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
128 CPUFREQ_TRANSITION_NOTIFIER))
129 cpufreq_init = 1;
130 return 0;
131}
132
133core_initcall(cpufreq_tsc);
134
135#endif
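
When the frequency changes on a CPU whose TSC rate is not constant, loops_per_jiffy and cpu_khz are rescaled in proportion to new_freq/old_freq; that ratio is essentially what cpufreq_scale() computes. A user-space sketch of the proportional update with invented frequencies:

#include <stdio.h>
#include <stdint.h>

/* proportional rescale, roughly what cpufreq_scale(ref, old, new) does */
static uint64_t scale(uint64_t ref, uint32_t old_freq, uint32_t new_freq)
{
	return ref * new_freq / old_freq;
}

int main(void)
{
	uint32_t ref_freq = 2000000, new_freq = 1000000;	/* kHz, invented */
	uint64_t loops_per_jiffy_ref = 4000000, cpu_khz_ref = 2000000;

	/* halving the clock halves both values */
	printf("lpj=%llu cpu_khz=%llu\n",
	       (unsigned long long)scale(loops_per_jiffy_ref, ref_freq, new_freq),
	       (unsigned long long)scale(cpu_khz_ref, ref_freq, new_freq));
	return 0;
}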
136
137static int tsc_unstable = 0;
138
139/*
140 * Make an educated guess if the TSC is trustworthy and synchronized
141 * over all CPUs.
142 */
143__cpuinit int unsynchronized_tsc(void)
144{
145 if (tsc_unstable)
146 return 1;
147
148#ifdef CONFIG_SMP
149 if (apic_is_clustered_box())
150 return 1;
151#endif
152 /* Most intel systems have synchronized TSCs except for
153 multi node systems */
154 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
155#ifdef CONFIG_ACPI
156 /* But TSC doesn't tick in C3 so don't use it there */
157 if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
158 return 1;
159#endif
160 return 0;
161 }
162
163 /* Assume multi socket systems are not synchronized */
164 return num_present_cpus() > 1;
165}
166
167int __init notsc_setup(char *s)
168{
169 notsc = 1;
170 return 1;
171}
172
173__setup("notsc", notsc_setup);
174
175
176/* clock source code: */
177static cycle_t read_tsc(void)
178{
179 cycle_t ret = (cycle_t)get_cycles_sync();
180 return ret;
181}
182
183static cycle_t __vsyscall_fn vread_tsc(void)
184{
185 cycle_t ret = (cycle_t)get_cycles_sync();
186 return ret;
187}
188
189static struct clocksource clocksource_tsc = {
190 .name = "tsc",
191 .rating = 300,
192 .read = read_tsc,
193 .mask = CLOCKSOURCE_MASK(64),
194 .shift = 22,
195 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
196 CLOCK_SOURCE_MUST_VERIFY,
197 .vread = vread_tsc,
198};
199
200void mark_tsc_unstable(void)
201{
202 if (!tsc_unstable) {
203 tsc_unstable = 1;
204 /* Change only the rating, when not registered */
205 if (clocksource_tsc.mult)
206 clocksource_change_rating(&clocksource_tsc, 0);
207 else
208 clocksource_tsc.rating = 0;
209 }
210}
211EXPORT_SYMBOL_GPL(mark_tsc_unstable);
212
213static int __init init_tsc_clocksource(void)
214{
215 if (!notsc) {
216 clocksource_tsc.mult = clocksource_khz2mult(cpu_khz,
217 clocksource_tsc.shift);
218 if (check_tsc_unstable())
219 clocksource_tsc.rating = 0;
220
221 return clocksource_register(&clocksource_tsc);
222 }
223 return 0;
224}
225
226module_init(init_tsc_clocksource);
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
new file mode 100644
index 000000000000..014f0db45dfa
--- /dev/null
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
1/*
2 * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
3 *
4 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
5 *
6 * We check whether all boot CPUs have their TSC's synchronized,
7 * print a warning if not and turn off the TSC clock-source.
8 *
9 * The warp-check is point-to-point between two CPUs, the CPU
10 * initiating the bootup is the 'source CPU', the freshly booting
11 * CPU is the 'target CPU'.
12 *
13 * Only two CPUs may participate - they can enter in any order.
14 * ( The serial nature of the boot logic and the CPU hotplug lock
15 * protects against more than 2 CPUs entering this code. )
16 */
17#include <linux/spinlock.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h>
21#include <linux/nmi.h>
22#include <asm/tsc.h>
23
24/*
25 * Entry/exit counters that make sure that both CPUs
26 * run the measurement code at once:
27 */
28static __cpuinitdata atomic_t start_count;
29static __cpuinitdata atomic_t stop_count;
30
31/*
32 * We use a raw spinlock in this exceptional case, because
33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps:
35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps;
40
41/*
42 * TSC-warp measurement loop running on both CPUs:
43 */
44static __cpuinit void check_tsc_warp(void)
45{
46 cycles_t start, now, prev, end;
47 int i;
48
49 start = get_cycles_sync();
50 /*
51 * The measurement runs for 20 msecs:
52 */
53 end = start + cpu_khz * 20ULL;
54 now = start;
55
56 for (i = 0; ; i++) {
57 /*
58 * We take the global lock, measure TSC, save the
59 * previous TSC that was measured (possibly on
60 * another CPU) and update the previous TSC timestamp.
61 */
62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc;
64 now = get_cycles_sync();
65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock);
67
68 /*
69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million
 71		 * loops safety exit, so we don't lock up in case the
72 * TSC readout is totally broken]):
73 */
74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000)
76 break;
77 cpu_relax();
78 touch_nmi_watchdog();
79 }
80 /*
81 * Outside the critical section we can now see whether
82 * we saw a time-warp of the TSC going backwards:
83 */
84 if (unlikely(prev > now)) {
85 __raw_spin_lock(&sync_lock);
86 max_warp = max(max_warp, prev - now);
87 nr_warps++;
88 __raw_spin_unlock(&sync_lock);
89 }
90
91 }
92}
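
The loop above keeps one shared "last timestamp" behind a raw spinlock: each CPU publishes its newest reading and reports a warp whenever the value it found on entry is larger than its own fresh reading. A user-space analogue with two threads and a mutex, using CLOCK_MONOTONIC instead of the TSC (so it should never report a warp); this only illustrates the locking pattern, it is not kernel code:

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_stamp, max_warp;
static int nr_warps;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void *warp_check(void *arg)
{
	for (int i = 0; i < 100000; i++) {
		pthread_mutex_lock(&sync_lock);
		uint64_t prev = last_stamp;	/* last reading, possibly from the other thread */
		uint64_t now = now_ns();
		last_stamp = now;
		pthread_mutex_unlock(&sync_lock);

		if (prev > now) {		/* time went backwards: a warp */
			pthread_mutex_lock(&sync_lock);
			if (prev - now > max_warp)
				max_warp = prev - now;
			nr_warps++;
			pthread_mutex_unlock(&sync_lock);
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("warps=%d max=%llu ns\n", nr_warps, (unsigned long long)max_warp);
	return 0;
}

Build with -pthread; with a monotonic clock the output should always report warps=0.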
93
94/*
95 * Source CPU calls into this - it waits for the freshly booted
96 * target CPU to arrive and then starts the measurement:
97 */
98void __cpuinit check_tsc_sync_source(int cpu)
99{
100 int cpus = 2;
101
102 /*
103 * No need to check if we already know that the TSC is not
104 * synchronized:
105 */
106 if (unsynchronized_tsc())
107 return;
108
109 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
110 smp_processor_id(), cpu);
111
112 /*
113 * Reset it - in case this is a second bootup:
114 */
115 atomic_set(&stop_count, 0);
116
117 /*
118 * Wait for the target to arrive:
119 */
120 while (atomic_read(&start_count) != cpus-1)
121 cpu_relax();
122 /*
123 * Trigger the target to continue into the measurement too:
124 */
125 atomic_inc(&start_count);
126
127 check_tsc_warp();
128
129 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax();
131
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) {
138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable();
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else {
146 printk(" passed.\n");
147 }
148
149 /*
150 * Let the target continue with the bootup:
151 */
152 atomic_inc(&stop_count);
153}
154
155/*
156 * Freshly booted CPUs call into this:
157 */
158void __cpuinit check_tsc_sync_target(void)
159{
160 int cpus = 2;
161
162 if (unsynchronized_tsc())
163 return;
164
165 /*
166 * Register this CPU's participation and wait for the
167 * source CPU to start the measurement:
168 */
169 atomic_inc(&start_count);
170 while (atomic_read(&start_count) != cpus)
171 cpu_relax();
172
173 check_tsc_warp();
174
175 /*
176 * Ok, we are done:
177 */
178 atomic_inc(&stop_count);
179
180 /*
181 * Wait for the source CPU to print stuff:
182 */
183 while (atomic_read(&stop_count) != cpus)
184 cpu_relax();
185}
186#undef NR_LOOPS
187
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index c360c4225244..b73212c0a550 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -88,31 +88,25 @@ SECTIONS
88 __vsyscall_0 = VSYSCALL_VIRT_ADDR; 88 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
89 89
90 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 90 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
91 .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } 91 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
92 xtime_lock = VVIRT(.xtime_lock); 92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 93 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
94 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } 94 { *(.vsyscall_gtod_data) }
95 vxtime = VVIRT(.vxtime); 95 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
96 96
97 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } 97 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
98 vgetcpu_mode = VVIRT(.vgetcpu_mode); 98 vgetcpu_mode = VVIRT(.vgetcpu_mode);
99 99
100 .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
101 sys_tz = VVIRT(.sys_tz);
102
103 .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
104 sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
105
106 .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
107 xtime = VVIRT(.xtime);
108
109 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 100 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
110 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } 101 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
111 jiffies = VVIRT(.jiffies); 102 jiffies = VVIRT(.jiffies);
112 103
113 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } 104 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
114 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } 105 { *(.vsyscall_1) }
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } 106 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
107 { *(.vsyscall_2) }
108 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
109 { *(.vsyscall_3) }
116 110
117 . = VSYSCALL_VIRT_ADDR + 4096; 111 . = VSYSCALL_VIRT_ADDR + 4096;
118 112
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 313dc6ad780b..180ff919eaf9 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
26#include <linux/seqlock.h> 26#include <linux/seqlock.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/sysctl.h> 28#include <linux/sysctl.h>
29#include <linux/clocksource.h>
29#include <linux/getcpu.h> 30#include <linux/getcpu.h>
30#include <linux/cpu.h> 31#include <linux/cpu.h>
31#include <linux/smp.h> 32#include <linux/smp.h>
@@ -34,6 +35,7 @@
34#include <asm/vsyscall.h> 35#include <asm/vsyscall.h>
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/unistd.h>
37#include <asm/fixmap.h> 39#include <asm/fixmap.h>
38#include <asm/errno.h> 40#include <asm/errno.h>
39#include <asm/io.h> 41#include <asm/io.h>
@@ -44,56 +46,41 @@
44#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 46#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
45#define __syscall_clobber "r11","rcx","memory" 47#define __syscall_clobber "r11","rcx","memory"
46 48
47int __sysctl_vsyscall __section_sysctl_vsyscall = 1; 49struct vsyscall_gtod_data_t {
48seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; 50 seqlock_t lock;
51 int sysctl_enabled;
52 struct timeval wall_time_tv;
53 struct timezone sys_tz;
54 cycle_t offset_base;
55 struct clocksource clock;
56};
49int __vgetcpu_mode __section_vgetcpu_mode; 57int __vgetcpu_mode __section_vgetcpu_mode;
50 58
51#include <asm/unistd.h> 59struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data =
52
53static __always_inline void timeval_normalize(struct timeval * tv)
54{ 60{
55 time_t __sec; 61 .lock = SEQLOCK_UNLOCKED,
56 62 .sysctl_enabled = 1,
57 __sec = tv->tv_usec / 1000000; 63};
58 if (__sec) {
59 tv->tv_usec %= 1000000;
60 tv->tv_sec += __sec;
61 }
62}
63 64
64static __always_inline void do_vgettimeofday(struct timeval * tv) 65void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
65{ 66{
66 long sequence, t; 67 unsigned long flags;
67 unsigned long sec, usec; 68
68 69 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
69 do { 70 /* copy vsyscall data */
70 sequence = read_seqbegin(&__xtime_lock); 71 vsyscall_gtod_data.clock = *clock;
71 72 vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
72 sec = __xtime.tv_sec; 73 vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
73 usec = __xtime.tv_nsec / 1000; 74 vsyscall_gtod_data.sys_tz = sys_tz;
74 75 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
75 if (__vxtime.mode != VXTIME_HPET) {
76 t = get_cycles_sync();
77 if (t < __vxtime.last_tsc)
78 t = __vxtime.last_tsc;
79 usec += ((t - __vxtime.last_tsc) *
80 __vxtime.tsc_quot) >> 32;
81 /* See comment in x86_64 do_gettimeofday. */
82 } else {
83 usec += ((readl((void __iomem *)
84 fix_to_virt(VSYSCALL_HPET) + 0xf0) -
85 __vxtime.last) * __vxtime.quot) >> 32;
86 }
87 } while (read_seqretry(&__xtime_lock, sequence));
88
89 tv->tv_sec = sec + usec / 1000000;
90 tv->tv_usec = usec % 1000000;
91} 76}
92 77
93/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ 78/* RED-PEN may want to readd seq locking, but then the variable should be
79 * write-once.
80 */
94static __always_inline void do_get_tz(struct timezone * tz) 81static __always_inline void do_get_tz(struct timezone * tz)
95{ 82{
96 *tz = __sys_tz; 83 *tz = __vsyscall_gtod_data.sys_tz;
97} 84}
98 85
99static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 86static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -101,7 +88,8 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
101 int ret; 88 int ret;
102 asm volatile("vsysc2: syscall" 89 asm volatile("vsysc2: syscall"
103 : "=a" (ret) 90 : "=a" (ret)
104 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); 91 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
92 : __syscall_clobber );
105 return ret; 93 return ret;
106} 94}
107 95
@@ -114,10 +102,44 @@ static __always_inline long time_syscall(long *t)
114 return secs; 102 return secs;
115} 103}
116 104
105static __always_inline void do_vgettimeofday(struct timeval * tv)
106{
107 cycle_t now, base, mask, cycle_delta;
108 unsigned long seq, mult, shift, nsec_delta;
109 cycle_t (*vread)(void);
110 do {
111 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
112
113 vread = __vsyscall_gtod_data.clock.vread;
114 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
115 gettimeofday(tv,0);
116 return;
117 }
118 now = vread();
119 base = __vsyscall_gtod_data.clock.cycle_last;
120 mask = __vsyscall_gtod_data.clock.mask;
121 mult = __vsyscall_gtod_data.clock.mult;
122 shift = __vsyscall_gtod_data.clock.shift;
123
124 *tv = __vsyscall_gtod_data.wall_time_tv;
125
126 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
127
128 /* calculate interval: */
129 cycle_delta = (now - base) & mask;
130 /* convert to nsecs: */
131 nsec_delta = (cycle_delta * mult) >> shift;
132
133 /* convert to usecs and add to timespec: */
134 tv->tv_usec += nsec_delta / NSEC_PER_USEC;
135 while (tv->tv_usec > USEC_PER_SEC) {
136 tv->tv_sec += 1;
137 tv->tv_usec -= USEC_PER_SEC;
138 }
139}
140
117int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 141int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
118{ 142{
119 if (!__sysctl_vsyscall)
120 return gettimeofday(tv,tz);
121 if (tv) 143 if (tv)
122 do_vgettimeofday(tv); 144 do_vgettimeofday(tv);
123 if (tz) 145 if (tz)
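
The do_vgettimeofday() added above does all interpolation with the generic clocksource parameters instead of the old TSC/HPET special cases: the elapsed cycles since cycle_last are masked to the counter width, scaled by mult, shifted by shift to get nanoseconds, and folded into the published wall time. A stand-alone sketch of that arithmetic with made-up constants (the real mult/shift come from the registered clocksource; the sketch normalizes with >= so tv_usec ends up strictly below one second):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000UL
#define USEC_PER_SEC	1000000UL

int main(void)
{
	uint64_t cycle_last = 1000000, now = 1003500;	/* hypothetical counter reads */
	uint64_t mask = ~0ULL;				/* 64-bit wide counter */
	uint32_t mult = 4194304, shift = 22;		/* scale factor of ~1 ns/cycle */
	uint64_t cycle_delta, nsec_delta;
	long tv_sec = 1170000000, tv_usec = 999999;	/* published wall time */

	cycle_delta = (now - cycle_last) & mask;	/* interval in cycles */
	nsec_delta = (cycle_delta * mult) >> shift;	/* cycles -> nanoseconds */

	tv_usec += nsec_delta / NSEC_PER_USEC;
	while (tv_usec >= USEC_PER_SEC) {		/* keep usec in [0, 1s) */
		tv_sec += 1;
		tv_usec -= USEC_PER_SEC;
	}
	printf("%ld.%06ld\n", tv_sec, tv_usec);
	return 0;
}
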
@@ -129,11 +151,11 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
129 * unlikely */ 151 * unlikely */
130time_t __vsyscall(1) vtime(time_t *t) 152time_t __vsyscall(1) vtime(time_t *t)
131{ 153{
132 if (!__sysctl_vsyscall) 154 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
133 return time_syscall(t); 155 return time_syscall(t);
134 else if (t) 156 else if (t)
135 *t = __xtime.tv_sec; 157 *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
136 return __xtime.tv_sec; 158 return __vsyscall_gtod_data.wall_time_tv.tv_sec;
137} 159}
138 160
139/* Fast way to get current CPU and node. 161/* Fast way to get current CPU and node.
@@ -210,7 +232,7 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
210 ret = -ENOMEM; 232 ret = -ENOMEM;
211 goto out; 233 goto out;
212 } 234 }
213 if (!sysctl_vsyscall) { 235 if (!vsyscall_gtod_data.sysctl_enabled) {
214 writew(SYSCALL, map1); 236 writew(SYSCALL, map1);
215 writew(SYSCALL, map2); 237 writew(SYSCALL, map2);
216 } else { 238 } else {
@@ -232,7 +254,8 @@ static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
232 254
233static ctl_table kernel_table2[] = { 255static ctl_table kernel_table2[] = {
234 { .ctl_name = 99, .procname = "vsyscall64", 256 { .ctl_name = 99, .procname = "vsyscall64",
235 .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, 257 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
258 .mode = 0644,
236 .strategy = vsyscall_sysctl_nostrat, 259 .strategy = vsyscall_sysctl_nostrat,
237 .proc_handler = vsyscall_sysctl_change }, 260 .proc_handler = vsyscall_sysctl_change },
238 {} 261 {}
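
For reference, the page these hunks maintain is callable directly from user space at the fixed legacy vsyscall address, which is what makes the sysctl_enabled fallback to a real syscall necessary. A user-space sketch for the era of this patch (0xffffffffff600000 is the historical VSYSCALL_START; newer kernels emulate or remove this mapping, so treat it purely as an illustration):

#include <stdio.h>
#include <sys/time.h>

typedef int (*vgettimeofday_t)(struct timeval *tv, struct timezone *tz);

int main(void)
{
	/* legacy fixed vsyscall slot 0: vgettimeofday */
	vgettimeofday_t vgtod = (vgettimeofday_t)0xffffffffff600000UL;
	struct timeval tv;

	if (vgtod(&tv, NULL) == 0)
		printf("vsyscall time: %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}
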
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 6c6751b1405b..8206fc1ecc58 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -39,6 +39,17 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/sched.h> /* need_resched() */ 40#include <linux/sched.h> /* need_resched() */
41#include <linux/latency.h> 41#include <linux/latency.h>
42#include <linux/clockchips.h>
43
44/*
45 * Include the apic definitions for x86 to have the APIC timer related defines
46 * available also for UP (on SMP it gets magically included via linux/smp.h).
47 * asm/acpi.h is not an option, as it would require more include magic. Also
48 * creating an empty asm-ia64/apic.h would just trade pest vs. cholera.
49 */
50#ifdef CONFIG_X86
51#include <asm/apic.h>
52#endif
42 53
43#include <asm/io.h> 54#include <asm/io.h>
44#include <asm/uaccess.h> 55#include <asm/uaccess.h>
@@ -238,6 +249,81 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
238 } 249 }
239} 250}
240 251
252#ifdef ARCH_APICTIMER_STOPS_ON_C3
253
254/*
255 * Some BIOS implementations switch to C3 in the published C2 state.
256 * This seems to be a common problem on AMD boxen, but other vendors
257 * are affected too. We pick the most conservative approach: we assume
258 * that the local APIC stops in both C2 and C3.
259 */
260static void acpi_timer_check_state(int state, struct acpi_processor *pr,
261 struct acpi_processor_cx *cx)
262{
263 struct acpi_processor_power *pwr = &pr->power;
264
265 /*
266 * Check, if one of the previous states already marked the lapic
267 * unstable
268 */
269 if (pwr->timer_broadcast_on_state < state)
270 return;
271
272 if (cx->type >= ACPI_STATE_C2)
273 pr->power.timer_broadcast_on_state = state;
274}
275
276static void acpi_propagate_timer_broadcast(struct acpi_processor *pr)
277{
278#ifdef CONFIG_GENERIC_CLOCKEVENTS
279 unsigned long reason;
280
281 reason = pr->power.timer_broadcast_on_state < INT_MAX ?
282 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
283
284 clockevents_notify(reason, &pr->id);
285#else
286 cpumask_t mask = cpumask_of_cpu(pr->id);
287
288 if (pr->power.timer_broadcast_on_state < INT_MAX)
289 on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1);
290 else
291 on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1);
292#endif
293}
294
295/* Power(C) State timer broadcast control */
296static void acpi_state_timer_broadcast(struct acpi_processor *pr,
297 struct acpi_processor_cx *cx,
298 int broadcast)
299{
300#ifdef CONFIG_GENERIC_CLOCKEVENTS
301
302 int state = cx - pr->power.states;
303
304 if (state >= pr->power.timer_broadcast_on_state) {
305 unsigned long reason;
306
307 reason = broadcast ? CLOCK_EVT_NOTIFY_BROADCAST_ENTER :
308 CLOCK_EVT_NOTIFY_BROADCAST_EXIT;
309 clockevents_notify(reason, &pr->id);
310 }
311#endif
312}
313
314#else
315
316static void acpi_timer_check_state(int state, struct acpi_processor *pr,
317 struct acpi_processor_cx *cstate) { }
318static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { }
319static void acpi_state_timer_broadcast(struct acpi_processor *pr,
320 struct acpi_processor_cx *cx,
321 int broadcast)
322{
323}
324
325#endif
326
241static void acpi_processor_idle(void) 327static void acpi_processor_idle(void)
242{ 328{
243 struct acpi_processor *pr = NULL; 329 struct acpi_processor *pr = NULL;
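
The helpers added above turn timer_broadcast_on_state into a simple threshold: it starts at INT_MAX ("no state needs broadcast") and is lowered to the shallowest C-state index whose entry is assumed to stop the local APIC timer; any idle entry at or beyond that index must hand its wakeup over to the broadcast device. A stand-alone model of that logic (the two-field state table and function names are illustrative only):

#include <stdio.h>
#include <limits.h>

#define ACPI_STATE_C2	2

struct cx { int type; };

static int timer_broadcast_on_state = INT_MAX;

static void check_state(int state, const struct cx *cx)
{
	if (timer_broadcast_on_state < state)
		return;				/* an earlier state already marked it */
	if (cx->type >= ACPI_STATE_C2)
		timer_broadcast_on_state = state;	/* conservative: C2 and C3 */
}

static int needs_broadcast(int state)
{
	return state >= timer_broadcast_on_state;
}

int main(void)
{
	struct cx states[] = { { 1 }, { 2 }, { 3 } };	/* C1, C2, C3 */
	int i;

	for (i = 0; i < 3; i++)
		check_state(i + 1, &states[i]);
	for (i = 1; i <= 3; i++)
		printf("state C%d: broadcast=%d\n", i, needs_broadcast(i));
	return 0;
}
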
@@ -382,6 +468,7 @@ static void acpi_processor_idle(void)
382 /* Get start time (ticks) */ 468 /* Get start time (ticks) */
383 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); 469 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
384 /* Invoke C2 */ 470 /* Invoke C2 */
471 acpi_state_timer_broadcast(pr, cx, 1);
385 acpi_cstate_enter(cx); 472 acpi_cstate_enter(cx);
386 /* Get end time (ticks) */ 473 /* Get end time (ticks) */
387 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); 474 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -396,6 +483,7 @@ static void acpi_processor_idle(void)
396 /* Compute time (ticks) that we were actually asleep */ 483 /* Compute time (ticks) that we were actually asleep */
397 sleep_ticks = 484 sleep_ticks =
398 ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; 485 ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
486 acpi_state_timer_broadcast(pr, cx, 0);
399 break; 487 break;
400 488
401 case ACPI_STATE_C3: 489 case ACPI_STATE_C3:
@@ -417,6 +505,7 @@ static void acpi_processor_idle(void)
417 /* Get start time (ticks) */ 505 /* Get start time (ticks) */
418 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); 506 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
419 /* Invoke C3 */ 507 /* Invoke C3 */
508 acpi_state_timer_broadcast(pr, cx, 1);
420 acpi_cstate_enter(cx); 509 acpi_cstate_enter(cx);
421 /* Get end time (ticks) */ 510 /* Get end time (ticks) */
422 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); 511 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -436,6 +525,7 @@ static void acpi_processor_idle(void)
436 /* Compute time (ticks) that we were actually asleep */ 525 /* Compute time (ticks) that we were actually asleep */
437 sleep_ticks = 526 sleep_ticks =
438 ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; 527 ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
528 acpi_state_timer_broadcast(pr, cx, 0);
439 break; 529 break;
440 530
441 default: 531 default:
@@ -904,11 +994,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
904 unsigned int i; 994 unsigned int i;
905 unsigned int working = 0; 995 unsigned int working = 0;
906 996
907#ifdef ARCH_APICTIMER_STOPS_ON_C3 997 pr->power.timer_broadcast_on_state = INT_MAX;
908 int timer_broadcast = 0;
909 cpumask_t mask = cpumask_of_cpu(pr->id);
910 on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1);
911#endif
912 998
913 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 999 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
914 struct acpi_processor_cx *cx = &pr->power.states[i]; 1000 struct acpi_processor_cx *cx = &pr->power.states[i];
@@ -920,21 +1006,14 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
920 1006
921 case ACPI_STATE_C2: 1007 case ACPI_STATE_C2:
922 acpi_processor_power_verify_c2(cx); 1008 acpi_processor_power_verify_c2(cx);
923#ifdef ARCH_APICTIMER_STOPS_ON_C3 1009 if (cx->valid)
924 /* Some AMD systems fake C3 as C2, but still 1010 acpi_timer_check_state(i, pr, cx);
925 have timer troubles */
926 if (cx->valid &&
927 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
928 timer_broadcast++;
929#endif
930 break; 1011 break;
931 1012
932 case ACPI_STATE_C3: 1013 case ACPI_STATE_C3:
933 acpi_processor_power_verify_c3(pr, cx); 1014 acpi_processor_power_verify_c3(pr, cx);
934#ifdef ARCH_APICTIMER_STOPS_ON_C3
935 if (cx->valid) 1015 if (cx->valid)
936 timer_broadcast++; 1016 acpi_timer_check_state(i, pr, cx);
937#endif
938 break; 1017 break;
939 } 1018 }
940 1019
@@ -942,10 +1021,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
942 working++; 1021 working++;
943 } 1022 }
944 1023
945#ifdef ARCH_APICTIMER_STOPS_ON_C3 1024 acpi_propagate_timer_broadcast(pr);
946 if (timer_broadcast)
947 on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1);
948#endif
949 1025
950 return (working); 1026 return (working);
951} 1027}
diff --git a/drivers/char/agp/Makefile b/drivers/char/agp/Makefile
index 3e581603d0a8..a0d04a23dacd 100644
--- a/drivers/char/agp/Makefile
+++ b/drivers/char/agp/Makefile
@@ -1,6 +1,7 @@
1agpgart-y := backend.o frontend.o generic.o isoch.o 1agpgart-y := backend.o frontend.o generic.o isoch.o
2 2
3obj-$(CONFIG_AGP) += agpgart.o 3obj-$(CONFIG_AGP) += agpgart.o
4obj-$(CONFIG_COMPAT) += compat_ioctl.o
4obj-$(CONFIG_AGP_ALI) += ali-agp.o 5obj-$(CONFIG_AGP_ALI) += ali-agp.o
5obj-$(CONFIG_AGP_ATI) += ati-agp.o 6obj-$(CONFIG_AGP_ATI) += ati-agp.o
6obj-$(CONFIG_AGP_AMD) += amd-k7-agp.o 7obj-$(CONFIG_AGP_AMD) += amd-k7-agp.o
diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index 1d59e2a5b9aa..9bd68d9f0f59 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -114,6 +114,7 @@ struct agp_bridge_driver {
114 void (*free_by_type)(struct agp_memory *); 114 void (*free_by_type)(struct agp_memory *);
115 void *(*agp_alloc_page)(struct agp_bridge_data *); 115 void *(*agp_alloc_page)(struct agp_bridge_data *);
116 void (*agp_destroy_page)(void *); 116 void (*agp_destroy_page)(void *);
117 int (*agp_type_to_mask_type) (struct agp_bridge_data *, int);
117}; 118};
118 119
119struct agp_bridge_data { 120struct agp_bridge_data {
@@ -218,6 +219,7 @@ struct agp_bridge_data {
218#define I810_PTE_MAIN_UNCACHED 0x00000000 219#define I810_PTE_MAIN_UNCACHED 0x00000000
219#define I810_PTE_LOCAL 0x00000002 220#define I810_PTE_LOCAL 0x00000002
220#define I810_PTE_VALID 0x00000001 221#define I810_PTE_VALID 0x00000001
222#define I830_PTE_SYSTEM_CACHED 0x00000006
221#define I810_SMRAM_MISCC 0x70 223#define I810_SMRAM_MISCC 0x70
222#define I810_GFX_MEM_WIN_SIZE 0x00010000 224#define I810_GFX_MEM_WIN_SIZE 0x00010000
223#define I810_GFX_MEM_WIN_32M 0x00010000 225#define I810_GFX_MEM_WIN_32M 0x00010000
@@ -270,8 +272,16 @@ void global_cache_flush(void);
270void get_agp_version(struct agp_bridge_data *bridge); 272void get_agp_version(struct agp_bridge_data *bridge);
271unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, 273unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge,
272 unsigned long addr, int type); 274 unsigned long addr, int type);
275int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge,
276 int type);
273struct agp_bridge_data *agp_generic_find_bridge(struct pci_dev *pdev); 277struct agp_bridge_data *agp_generic_find_bridge(struct pci_dev *pdev);
274 278
279/* generic functions for user-populated AGP memory types */
280struct agp_memory *agp_generic_alloc_user(size_t page_count, int type);
281void agp_alloc_page_array(size_t size, struct agp_memory *mem);
282void agp_free_page_array(struct agp_memory *mem);
283
284
275/* generic routines for agp>=3 */ 285/* generic routines for agp>=3 */
276int agp3_generic_fetch_size(void); 286int agp3_generic_fetch_size(void);
277void agp3_generic_tlbflush(struct agp_memory *mem); 287void agp3_generic_tlbflush(struct agp_memory *mem);
@@ -288,6 +298,8 @@ extern struct aper_size_info_16 agp3_generic_sizes[];
288extern int agp_off; 298extern int agp_off;
289extern int agp_try_unsupported_boot; 299extern int agp_try_unsupported_boot;
290 300
301long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
302
291/* Chipset independant registers (from AGP Spec) */ 303/* Chipset independant registers (from AGP Spec) */
292#define AGP_APBASE 0x10 304#define AGP_APBASE 0x10
293 305
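
The new agp_type_to_mask_type hook separates the caller-visible memory type from the index into the driver's gatt_mask table. The user-defined types it deals with are expected to come from a companion include/linux/agp_backend.h change that is not part of this diff, so the exact constants below are assumptions; the mapping logic mirrors agp_generic_type_to_mask_type() added later in this series:

#include <stdio.h>

#define AGP_NORMAL_MEMORY	0
#define AGP_USER_TYPES		(1 << 16)		/* assumed agp_backend.h value */
#define AGP_USER_MEMORY		(AGP_USER_TYPES)
#define AGP_USER_CACHED_MEMORY	(AGP_USER_TYPES + 1)

/* Generic drivers know only one mask, so every user type collapses to
 * index 0; driver-private types below the boundary pass through. */
static int generic_type_to_mask_type(int type)
{
	if (type >= AGP_USER_TYPES)
		return 0;
	return type;
}

int main(void)
{
	printf("normal -> %d\n", generic_type_to_mask_type(AGP_NORMAL_MEMORY));
	printf("user   -> %d\n", generic_type_to_mask_type(AGP_USER_MEMORY));
	printf("cached -> %d\n", generic_type_to_mask_type(AGP_USER_CACHED_MEMORY));
	return 0;
}
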
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index 5a31ec7c62fc..98177a93076f 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -214,6 +214,7 @@ static struct agp_bridge_driver ali_generic_bridge = {
214 .free_by_type = agp_generic_free_by_type, 214 .free_by_type = agp_generic_free_by_type,
215 .agp_alloc_page = agp_generic_alloc_page, 215 .agp_alloc_page = agp_generic_alloc_page,
216 .agp_destroy_page = ali_destroy_page, 216 .agp_destroy_page = ali_destroy_page,
217 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
217}; 218};
218 219
219static struct agp_bridge_driver ali_m1541_bridge = { 220static struct agp_bridge_driver ali_m1541_bridge = {
@@ -237,6 +238,7 @@ static struct agp_bridge_driver ali_m1541_bridge = {
237 .free_by_type = agp_generic_free_by_type, 238 .free_by_type = agp_generic_free_by_type,
238 .agp_alloc_page = m1541_alloc_page, 239 .agp_alloc_page = m1541_alloc_page,
239 .agp_destroy_page = m1541_destroy_page, 240 .agp_destroy_page = m1541_destroy_page,
241 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
240}; 242};
241 243
242 244
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index b4e00a343da9..b0acf41c0db9 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -91,6 +91,9 @@ static int alpha_core_agp_insert_memory(struct agp_memory *mem, off_t pg_start,
91 int num_entries, status; 91 int num_entries, status;
92 void *temp; 92 void *temp;
93 93
94 if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
95 return -EINVAL;
96
94 temp = agp_bridge->current_size; 97 temp = agp_bridge->current_size;
95 num_entries = A_SIZE_FIX(temp)->num_entries; 98 num_entries = A_SIZE_FIX(temp)->num_entries;
96 if ((pg_start + mem->page_count) > num_entries) 99 if ((pg_start + mem->page_count) > num_entries)
@@ -142,6 +145,7 @@ struct agp_bridge_driver alpha_core_agp_driver = {
142 .free_by_type = agp_generic_free_by_type, 145 .free_by_type = agp_generic_free_by_type,
143 .agp_alloc_page = agp_generic_alloc_page, 146 .agp_alloc_page = agp_generic_alloc_page,
144 .agp_destroy_page = agp_generic_destroy_page, 147 .agp_destroy_page = agp_generic_destroy_page,
148 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
145}; 149};
146 150
147struct agp_bridge_data *alpha_bridge; 151struct agp_bridge_data *alpha_bridge;
diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c
index c85c8cadb6df..3d8d448bf394 100644
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -381,6 +381,7 @@ static struct agp_bridge_driver amd_irongate_driver = {
381 .free_by_type = agp_generic_free_by_type, 381 .free_by_type = agp_generic_free_by_type,
382 .agp_alloc_page = agp_generic_alloc_page, 382 .agp_alloc_page = agp_generic_alloc_page,
383 .agp_destroy_page = agp_generic_destroy_page, 383 .agp_destroy_page = agp_generic_destroy_page,
384 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
384}; 385};
385 386
386static struct agp_device_ids amd_agp_device_ids[] __devinitdata = 387static struct agp_device_ids amd_agp_device_ids[] __devinitdata =
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 93d2209fee4c..636d984ed4a6 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -62,12 +62,18 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
62{ 62{
63 int i, j, num_entries; 63 int i, j, num_entries;
64 long long tmp; 64 long long tmp;
65 int mask_type;
66 struct agp_bridge_data *bridge = mem->bridge;
65 u32 pte; 67 u32 pte;
66 68
67 num_entries = agp_num_entries(); 69 num_entries = agp_num_entries();
68 70
69 if (type != 0 || mem->type != 0) 71 if (type != mem->type)
70 return -EINVAL; 72 return -EINVAL;
73 mask_type = bridge->driver->agp_type_to_mask_type(bridge, type);
74 if (mask_type != 0)
75 return -EINVAL;
76
71 77
72 /* Make sure we can fit the range in the gatt table. */ 78 /* Make sure we can fit the range in the gatt table. */
73 /* FIXME: could wrap */ 79 /* FIXME: could wrap */
@@ -90,7 +96,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
90 96
91 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { 97 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
92 tmp = agp_bridge->driver->mask_memory(agp_bridge, 98 tmp = agp_bridge->driver->mask_memory(agp_bridge,
93 mem->memory[i], mem->type); 99 mem->memory[i], mask_type);
94 100
95 BUG_ON(tmp & 0xffffff0000000ffcULL); 101 BUG_ON(tmp & 0xffffff0000000ffcULL);
96 pte = (tmp & 0x000000ff00000000ULL) >> 28; 102 pte = (tmp & 0x000000ff00000000ULL) >> 28;
@@ -247,6 +253,7 @@ static struct agp_bridge_driver amd_8151_driver = {
247 .free_by_type = agp_generic_free_by_type, 253 .free_by_type = agp_generic_free_by_type,
248 .agp_alloc_page = agp_generic_alloc_page, 254 .agp_alloc_page = agp_generic_alloc_page,
249 .agp_destroy_page = agp_generic_destroy_page, 255 .agp_destroy_page = agp_generic_destroy_page,
256 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
250}; 257};
251 258
252/* Some basic sanity checks for the aperture. */ 259/* Some basic sanity checks for the aperture. */
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index 9987dc2e0c3f..77c9ad68fba9 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -431,6 +431,7 @@ static struct agp_bridge_driver ati_generic_bridge = {
431 .free_by_type = agp_generic_free_by_type, 431 .free_by_type = agp_generic_free_by_type,
432 .agp_alloc_page = agp_generic_alloc_page, 432 .agp_alloc_page = agp_generic_alloc_page,
433 .agp_destroy_page = agp_generic_destroy_page, 433 .agp_destroy_page = agp_generic_destroy_page,
434 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
434}; 435};
435 436
436 437
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index d59e037ddd12..ebdd6dd66edb 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -43,7 +43,7 @@
43 * fix some real stupidity. It's only by chance we can bump 43 * fix some real stupidity. It's only by chance we can bump
44 * past 0.99 at all due to some boolean logic error. */ 44 * past 0.99 at all due to some boolean logic error. */
45#define AGPGART_VERSION_MAJOR 0 45#define AGPGART_VERSION_MAJOR 0
46#define AGPGART_VERSION_MINOR 101 46#define AGPGART_VERSION_MINOR 102
47static const struct agp_version agp_current_version = 47static const struct agp_version agp_current_version =
48{ 48{
49 .major = AGPGART_VERSION_MAJOR, 49 .major = AGPGART_VERSION_MAJOR,
diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c
new file mode 100644
index 000000000000..fcb4b1bf0d4e
--- /dev/null
+++ b/drivers/char/agp/compat_ioctl.c
@@ -0,0 +1,282 @@
1/*
2 * AGPGART driver frontend compatibility ioctls
3 * Copyright (C) 2004 Silicon Graphics, Inc.
4 * Copyright (C) 2002-2003 Dave Jones
5 * Copyright (C) 1999 Jeff Hartmann
6 * Copyright (C) 1999 Precision Insight, Inc.
7 * Copyright (C) 1999 Xi Graphics, Inc.
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * JEFF HARTMANN, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
23 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
24 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
25 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 */
28
29#include <linux/kernel.h>
30#include <linux/pci.h>
31#include <linux/agpgart.h>
32#include <asm/uaccess.h>
33#include "agp.h"
34#include "compat_ioctl.h"
35
36static int compat_agpioc_info_wrap(struct agp_file_private *priv, void __user *arg)
37{
38 struct agp_info32 userinfo;
39 struct agp_kern_info kerninfo;
40
41 agp_copy_info(agp_bridge, &kerninfo);
42
43 userinfo.version.major = kerninfo.version.major;
44 userinfo.version.minor = kerninfo.version.minor;
45 userinfo.bridge_id = kerninfo.device->vendor |
46 (kerninfo.device->device << 16);
47 userinfo.agp_mode = kerninfo.mode;
48 userinfo.aper_base = (compat_long_t)kerninfo.aper_base;
49 userinfo.aper_size = kerninfo.aper_size;
50 userinfo.pg_total = userinfo.pg_system = kerninfo.max_memory;
51 userinfo.pg_used = kerninfo.current_memory;
52
53 if (copy_to_user(arg, &userinfo, sizeof(userinfo)))
54 return -EFAULT;
55
56 return 0;
57}
58
59static int compat_agpioc_reserve_wrap(struct agp_file_private *priv, void __user *arg)
60{
61 struct agp_region32 ureserve;
62 struct agp_region kreserve;
63 struct agp_client *client;
64 struct agp_file_private *client_priv;
65
66 DBG("");
67 if (copy_from_user(&ureserve, arg, sizeof(ureserve)))
68 return -EFAULT;
69
70 if ((unsigned) ureserve.seg_count >= ~0U/sizeof(struct agp_segment32))
71 return -EFAULT;
72
73 kreserve.pid = ureserve.pid;
74 kreserve.seg_count = ureserve.seg_count;
75
76 client = agp_find_client_by_pid(kreserve.pid);
77
78 if (kreserve.seg_count == 0) {
79 /* remove a client */
80 client_priv = agp_find_private(kreserve.pid);
81
82 if (client_priv != NULL) {
83 set_bit(AGP_FF_IS_CLIENT, &client_priv->access_flags);
84 set_bit(AGP_FF_IS_VALID, &client_priv->access_flags);
85 }
86 if (client == NULL) {
87 /* client is already removed */
88 return 0;
89 }
90 return agp_remove_client(kreserve.pid);
91 } else {
92 struct agp_segment32 *usegment;
93 struct agp_segment *ksegment;
94 int seg;
95
96 if (ureserve.seg_count >= 16384)
97 return -EINVAL;
98
99 usegment = kmalloc(sizeof(*usegment) * ureserve.seg_count, GFP_KERNEL);
100 if (!usegment)
101 return -ENOMEM;
102
103 ksegment = kmalloc(sizeof(*ksegment) * kreserve.seg_count, GFP_KERNEL);
104 if (!ksegment) {
105 kfree(usegment);
106 return -ENOMEM;
107 }
108
109 if (copy_from_user(usegment, (void __user *) ureserve.seg_list,
110 sizeof(*usegment) * ureserve.seg_count)) {
111 kfree(usegment);
112 kfree(ksegment);
113 return -EFAULT;
114 }
115
116 for (seg = 0; seg < ureserve.seg_count; seg++) {
117 ksegment[seg].pg_start = usegment[seg].pg_start;
118 ksegment[seg].pg_count = usegment[seg].pg_count;
119 ksegment[seg].prot = usegment[seg].prot;
120 }
121
122 kfree(usegment);
123 kreserve.seg_list = ksegment;
124
125 if (client == NULL) {
126 /* Create the client and add the segment */
127 client = agp_create_client(kreserve.pid);
128
129 if (client == NULL) {
130 kfree(ksegment);
131 return -ENOMEM;
132 }
133 client_priv = agp_find_private(kreserve.pid);
134
135 if (client_priv != NULL) {
136 set_bit(AGP_FF_IS_CLIENT, &client_priv->access_flags);
137 set_bit(AGP_FF_IS_VALID, &client_priv->access_flags);
138 }
139 }
140 return agp_create_segment(client, &kreserve);
141 }
142 /* Will never really happen */
143 return -EINVAL;
144}
145
146static int compat_agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg)
147{
148 struct agp_memory *memory;
149 struct agp_allocate32 alloc;
150
151 DBG("");
152 if (copy_from_user(&alloc, arg, sizeof(alloc)))
153 return -EFAULT;
154
155 memory = agp_allocate_memory_wrap(alloc.pg_count, alloc.type);
156
157 if (memory == NULL)
158 return -ENOMEM;
159
160 alloc.key = memory->key;
161 alloc.physical = memory->physical;
162
163 if (copy_to_user(arg, &alloc, sizeof(alloc))) {
164 agp_free_memory_wrap(memory);
165 return -EFAULT;
166 }
167 return 0;
168}
169
170static int compat_agpioc_bind_wrap(struct agp_file_private *priv, void __user *arg)
171{
172 struct agp_bind32 bind_info;
173 struct agp_memory *memory;
174
175 DBG("");
176 if (copy_from_user(&bind_info, arg, sizeof(bind_info)))
177 return -EFAULT;
178
179 memory = agp_find_mem_by_key(bind_info.key);
180
181 if (memory == NULL)
182 return -EINVAL;
183
184 return agp_bind_memory(memory, bind_info.pg_start);
185}
186
187static int compat_agpioc_unbind_wrap(struct agp_file_private *priv, void __user *arg)
188{
189 struct agp_memory *memory;
190 struct agp_unbind32 unbind;
191
192 DBG("");
193 if (copy_from_user(&unbind, arg, sizeof(unbind)))
194 return -EFAULT;
195
196 memory = agp_find_mem_by_key(unbind.key);
197
198 if (memory == NULL)
199 return -EINVAL;
200
201 return agp_unbind_memory(memory);
202}
203
204long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
205{
206 struct agp_file_private *curr_priv = file->private_data;
207 int ret_val = -ENOTTY;
208
209 mutex_lock(&(agp_fe.agp_mutex));
210
211 if ((agp_fe.current_controller == NULL) &&
212 (cmd != AGPIOC_ACQUIRE32)) {
213 ret_val = -EINVAL;
214 goto ioctl_out;
215 }
216 if ((agp_fe.backend_acquired != TRUE) &&
217 (cmd != AGPIOC_ACQUIRE32)) {
218 ret_val = -EBUSY;
219 goto ioctl_out;
220 }
221 if (cmd != AGPIOC_ACQUIRE32) {
222 if (!(test_bit(AGP_FF_IS_CONTROLLER, &curr_priv->access_flags))) {
223 ret_val = -EPERM;
224 goto ioctl_out;
225 }
226 /* Use the original pid of the controller,
227 * in case it's threaded */
228
229 if (agp_fe.current_controller->pid != curr_priv->my_pid) {
230 ret_val = -EBUSY;
231 goto ioctl_out;
232 }
233 }
234
235 switch (cmd) {
236 case AGPIOC_INFO32:
237 ret_val = compat_agpioc_info_wrap(curr_priv, (void __user *) arg);
238 break;
239
240 case AGPIOC_ACQUIRE32:
241 ret_val = agpioc_acquire_wrap(curr_priv);
242 break;
243
244 case AGPIOC_RELEASE32:
245 ret_val = agpioc_release_wrap(curr_priv);
246 break;
247
248 case AGPIOC_SETUP32:
249 ret_val = agpioc_setup_wrap(curr_priv, (void __user *) arg);
250 break;
251
252 case AGPIOC_RESERVE32:
253 ret_val = compat_agpioc_reserve_wrap(curr_priv, (void __user *) arg);
254 break;
255
256 case AGPIOC_PROTECT32:
257 ret_val = agpioc_protect_wrap(curr_priv);
258 break;
259
260 case AGPIOC_ALLOCATE32:
261 ret_val = compat_agpioc_allocate_wrap(curr_priv, (void __user *) arg);
262 break;
263
264 case AGPIOC_DEALLOCATE32:
265 ret_val = agpioc_deallocate_wrap(curr_priv, (int) arg);
266 break;
267
268 case AGPIOC_BIND32:
269 ret_val = compat_agpioc_bind_wrap(curr_priv, (void __user *) arg);
270 break;
271
272 case AGPIOC_UNBIND32:
273 ret_val = compat_agpioc_unbind_wrap(curr_priv, (void __user *) arg);
274 break;
275 }
276
277ioctl_out:
278 DBG("ioctl returns %d\n", ret_val);
279 mutex_unlock(&(agp_fe.agp_mutex));
280 return ret_val;
281}
282
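
The wrappers in this new file exist because the same ioctl payload has a different layout for 32-bit and 64-bit callers, mainly due to the embedded pointer and size_t fields. A stand-alone illustration with simplified stand-ins for agp_region/agp_region32 (not the real structures; sizes quoted for x86_64):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct region64 {		/* what the 64-bit kernel side expects */
	int32_t pid;
	size_t seg_count;
	void *seg_list;
};

struct region32 {		/* what a 32-bit process hands in */
	int32_t pid;
	uint32_t seg_count;
	uint32_t seg_list;	/* 32-bit user pointer, i.e. compat_uptr_t */
};

int main(void)
{
	printf("64-bit layout: %zu bytes\n", sizeof(struct region64));	/* 24 */
	printf("32-bit layout: %zu bytes\n", sizeof(struct region32));	/* 12 */
	return 0;
}
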
diff --git a/drivers/char/agp/compat_ioctl.h b/drivers/char/agp/compat_ioctl.h
new file mode 100644
index 000000000000..71939d637236
--- /dev/null
+++ b/drivers/char/agp/compat_ioctl.h
@@ -0,0 +1,105 @@
1/*
2 * Copyright (C) 1999 Jeff Hartmann
3 * Copyright (C) 1999 Precision Insight, Inc.
4 * Copyright (C) 1999 Xi Graphics, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * JEFF HARTMANN, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
22 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26#ifndef _AGP_COMPAT_IOCTL_H
27#define _AGP_COMPAT_IOCTL_H
28
29#include <linux/compat.h>
30#include <linux/agpgart.h>
31
32#define AGPIOC_INFO32 _IOR (AGPIOC_BASE, 0, compat_uptr_t)
33#define AGPIOC_ACQUIRE32 _IO (AGPIOC_BASE, 1)
34#define AGPIOC_RELEASE32 _IO (AGPIOC_BASE, 2)
35#define AGPIOC_SETUP32 _IOW (AGPIOC_BASE, 3, compat_uptr_t)
36#define AGPIOC_RESERVE32 _IOW (AGPIOC_BASE, 4, compat_uptr_t)
37#define AGPIOC_PROTECT32 _IOW (AGPIOC_BASE, 5, compat_uptr_t)
38#define AGPIOC_ALLOCATE32 _IOWR(AGPIOC_BASE, 6, compat_uptr_t)
39#define AGPIOC_DEALLOCATE32 _IOW (AGPIOC_BASE, 7, compat_int_t)
40#define AGPIOC_BIND32 _IOW (AGPIOC_BASE, 8, compat_uptr_t)
41#define AGPIOC_UNBIND32 _IOW (AGPIOC_BASE, 9, compat_uptr_t)
42
43struct agp_info32 {
44 struct agp_version version; /* version of the driver */
45 u32 bridge_id; /* bridge vendor/device */
46 u32 agp_mode; /* mode info of bridge */
47 compat_long_t aper_base; /* base of aperture */
48 compat_size_t aper_size; /* size of aperture */
49 compat_size_t pg_total; /* max pages (swap + system) */
50 compat_size_t pg_system; /* max pages (system) */
51 compat_size_t pg_used; /* current pages used */
52};
53
54/*
55 * The "prot" down below needs still a "sleep" flag somehow ...
56 */
57struct agp_segment32 {
58 compat_off_t pg_start; /* starting page to populate */
59 compat_size_t pg_count; /* number of pages */
60 compat_int_t prot; /* prot flags for mmap */
61};
62
63struct agp_region32 {
64 compat_pid_t pid; /* pid of process */
65 compat_size_t seg_count; /* number of segments */
66 struct agp_segment32 *seg_list;
67};
68
69struct agp_allocate32 {
70 compat_int_t key; /* tag of allocation */
71 compat_size_t pg_count; /* number of pages */
72 u32 type; /* 0 == normal, other devspec */
73 u32 physical; /* device specific (some devices
74 * need a phys address of the
75 * actual page behind the gatt
76 * table) */
77};
78
79struct agp_bind32 {
80 compat_int_t key; /* tag of allocation */
81 compat_off_t pg_start; /* starting page to populate */
82};
83
84struct agp_unbind32 {
85 compat_int_t key; /* tag of allocation */
86 u32 priority; /* priority for paging out */
87};
88
89extern struct agp_front_data agp_fe;
90
91int agpioc_acquire_wrap(struct agp_file_private *priv);
92int agpioc_release_wrap(struct agp_file_private *priv);
93int agpioc_protect_wrap(struct agp_file_private *priv);
94int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg);
95int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg);
96struct agp_file_private *agp_find_private(pid_t pid);
97struct agp_client *agp_create_client(pid_t id);
98int agp_remove_client(pid_t id);
99int agp_create_segment(struct agp_client *client, struct agp_region *region);
100void agp_free_memory_wrap(struct agp_memory *memory);
101struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type);
102struct agp_memory *agp_find_mem_by_key(int key);
103struct agp_client *agp_find_client_by_pid(pid_t id);
104
105#endif /* _AGP_COMPAT_H */
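
The AGPIOC_*32 numbers above are needed because _IOW() folds sizeof(type) into the command word, so the value a 32-bit process generates is not what the native AGPIOC_* macro expands to on a 64-bit kernel. A stand-alone check of that size dependence on a 64-bit Linux box, using unsigned long and uint32_t as stand-ins for the native pointer and compat_uptr_t arguments (AGPIOC_BASE is the character 'A' in linux/agpgart.h):

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>

#define AGPIOC_BASE 'A'

int main(void)
{
	unsigned long native = _IOW(AGPIOC_BASE, 4, unsigned long);	/* 8-byte size field */
	unsigned long compat = _IOW(AGPIOC_BASE, 4, uint32_t);		/* 4-byte size field */

	printf("native reserve cmd: %#lx\n", native);
	printf("compat reserve cmd: %#lx\n", compat);
	return 0;
}
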
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 30f730ff81c1..658cb1a72d2c 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -335,6 +335,7 @@ static struct agp_bridge_driver efficeon_driver = {
335 .free_by_type = agp_generic_free_by_type, 335 .free_by_type = agp_generic_free_by_type,
336 .agp_alloc_page = agp_generic_alloc_page, 336 .agp_alloc_page = agp_generic_alloc_page,
337 .agp_destroy_page = agp_generic_destroy_page, 337 .agp_destroy_page = agp_generic_destroy_page,
338 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
338}; 339};
339 340
340static int __devinit agp_efficeon_probe(struct pci_dev *pdev, 341static int __devinit agp_efficeon_probe(struct pci_dev *pdev,
diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c
index 0f2ed2aa2d81..679d7f972439 100644
--- a/drivers/char/agp/frontend.c
+++ b/drivers/char/agp/frontend.c
@@ -41,9 +41,9 @@
41#include <asm/pgtable.h> 41#include <asm/pgtable.h>
42#include "agp.h" 42#include "agp.h"
43 43
44static struct agp_front_data agp_fe; 44struct agp_front_data agp_fe;
45 45
46static struct agp_memory *agp_find_mem_by_key(int key) 46struct agp_memory *agp_find_mem_by_key(int key)
47{ 47{
48 struct agp_memory *curr; 48 struct agp_memory *curr;
49 49
@@ -159,7 +159,7 @@ static pgprot_t agp_convert_mmap_flags(int prot)
159 return vm_get_page_prot(prot_bits); 159 return vm_get_page_prot(prot_bits);
160} 160}
161 161
162static int agp_create_segment(struct agp_client *client, struct agp_region *region) 162int agp_create_segment(struct agp_client *client, struct agp_region *region)
163{ 163{
164 struct agp_segment_priv **ret_seg; 164 struct agp_segment_priv **ret_seg;
165 struct agp_segment_priv *seg; 165 struct agp_segment_priv *seg;
@@ -211,7 +211,7 @@ static void agp_insert_into_pool(struct agp_memory * temp)
211 211
212/* File private list routines */ 212/* File private list routines */
213 213
214static struct agp_file_private *agp_find_private(pid_t pid) 214struct agp_file_private *agp_find_private(pid_t pid)
215{ 215{
216 struct agp_file_private *curr; 216 struct agp_file_private *curr;
217 217
@@ -266,13 +266,13 @@ static void agp_remove_file_private(struct agp_file_private * priv)
266 * Wrappers for agp_free_memory & agp_allocate_memory 266 * Wrappers for agp_free_memory & agp_allocate_memory
267 * These make sure that internal lists are kept updated. 267 * These make sure that internal lists are kept updated.
268 */ 268 */
269static void agp_free_memory_wrap(struct agp_memory *memory) 269void agp_free_memory_wrap(struct agp_memory *memory)
270{ 270{
271 agp_remove_from_pool(memory); 271 agp_remove_from_pool(memory);
272 agp_free_memory(memory); 272 agp_free_memory(memory);
273} 273}
274 274
275static struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type) 275struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type)
276{ 276{
277 struct agp_memory *memory; 277 struct agp_memory *memory;
278 278
@@ -484,7 +484,7 @@ static struct agp_controller *agp_find_controller_for_client(pid_t id)
484 return NULL; 484 return NULL;
485} 485}
486 486
487static struct agp_client *agp_find_client_by_pid(pid_t id) 487struct agp_client *agp_find_client_by_pid(pid_t id)
488{ 488{
489 struct agp_client *temp; 489 struct agp_client *temp;
490 490
@@ -509,7 +509,7 @@ static void agp_insert_client(struct agp_client *client)
509 agp_fe.current_controller->num_clients++; 509 agp_fe.current_controller->num_clients++;
510} 510}
511 511
512static struct agp_client *agp_create_client(pid_t id) 512struct agp_client *agp_create_client(pid_t id)
513{ 513{
514 struct agp_client *new_client; 514 struct agp_client *new_client;
515 515
@@ -522,7 +522,7 @@ static struct agp_client *agp_create_client(pid_t id)
522 return new_client; 522 return new_client;
523} 523}
524 524
525static int agp_remove_client(pid_t id) 525int agp_remove_client(pid_t id)
526{ 526{
527 struct agp_client *client; 527 struct agp_client *client;
528 struct agp_client *prev_client; 528 struct agp_client *prev_client;
@@ -746,7 +746,7 @@ static int agpioc_info_wrap(struct agp_file_private *priv, void __user *arg)
746 return 0; 746 return 0;
747} 747}
748 748
749static int agpioc_acquire_wrap(struct agp_file_private *priv) 749int agpioc_acquire_wrap(struct agp_file_private *priv)
750{ 750{
751 struct agp_controller *controller; 751 struct agp_controller *controller;
752 752
@@ -789,14 +789,14 @@ static int agpioc_acquire_wrap(struct agp_file_private *priv)
789 return 0; 789 return 0;
790} 790}
791 791
792static int agpioc_release_wrap(struct agp_file_private *priv) 792int agpioc_release_wrap(struct agp_file_private *priv)
793{ 793{
794 DBG(""); 794 DBG("");
795 agp_controller_release_current(agp_fe.current_controller, priv); 795 agp_controller_release_current(agp_fe.current_controller, priv);
796 return 0; 796 return 0;
797} 797}
798 798
799static int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg) 799int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg)
800{ 800{
801 struct agp_setup mode; 801 struct agp_setup mode;
802 802
@@ -876,7 +876,7 @@ static int agpioc_reserve_wrap(struct agp_file_private *priv, void __user *arg)
876 return -EINVAL; 876 return -EINVAL;
877} 877}
878 878
879static int agpioc_protect_wrap(struct agp_file_private *priv) 879int agpioc_protect_wrap(struct agp_file_private *priv)
880{ 880{
881 DBG(""); 881 DBG("");
882 /* This function is not currently implemented */ 882 /* This function is not currently implemented */
@@ -892,6 +892,9 @@ static int agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg)
892 if (copy_from_user(&alloc, arg, sizeof(struct agp_allocate))) 892 if (copy_from_user(&alloc, arg, sizeof(struct agp_allocate)))
893 return -EFAULT; 893 return -EFAULT;
894 894
895 if (alloc.type >= AGP_USER_TYPES)
896 return -EINVAL;
897
895 memory = agp_allocate_memory_wrap(alloc.pg_count, alloc.type); 898 memory = agp_allocate_memory_wrap(alloc.pg_count, alloc.type);
896 899
897 if (memory == NULL) 900 if (memory == NULL)
@@ -907,7 +910,7 @@ static int agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg)
907 return 0; 910 return 0;
908} 911}
909 912
910static int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg) 913int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg)
911{ 914{
912 struct agp_memory *memory; 915 struct agp_memory *memory;
913 916
@@ -1043,6 +1046,9 @@ static const struct file_operations agp_fops =
1043 .read = agp_read, 1046 .read = agp_read,
1044 .write = agp_write, 1047 .write = agp_write,
1045 .ioctl = agp_ioctl, 1048 .ioctl = agp_ioctl,
1049#ifdef CONFIG_COMPAT
1050 .compat_ioctl = compat_agp_ioctl,
1051#endif
1046 .mmap = agp_mmap, 1052 .mmap = agp_mmap,
1047 .open = agp_open, 1053 .open = agp_open,
1048 .release = agp_release, 1054 .release = agp_release,
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 3491d6f84bc6..7923337c3d26 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -101,6 +101,63 @@ static int agp_get_key(void)
101 return -1; 101 return -1;
102} 102}
103 103
104/*
105 * Use kmalloc if possible for the page list. Otherwise fall back to
106 * vmalloc. This speeds things up and also saves memory for small AGP
107 * regions.
108 */
109
110void agp_alloc_page_array(size_t size, struct agp_memory *mem)
111{
112 mem->memory = NULL;
113 mem->vmalloc_flag = 0;
114
115 if (size <= 2*PAGE_SIZE)
116 mem->memory = kmalloc(size, GFP_KERNEL | __GFP_NORETRY);
117 if (mem->memory == NULL) {
118 mem->memory = vmalloc(size);
119 mem->vmalloc_flag = 1;
120 }
121}
122EXPORT_SYMBOL(agp_alloc_page_array);
123
124void agp_free_page_array(struct agp_memory *mem)
125{
126 if (mem->vmalloc_flag) {
127 vfree(mem->memory);
128 } else {
129 kfree(mem->memory);
130 }
131}
132EXPORT_SYMBOL(agp_free_page_array);
133
134
135static struct agp_memory *agp_create_user_memory(unsigned long num_agp_pages)
136{
137 struct agp_memory *new;
138 unsigned long alloc_size = num_agp_pages*sizeof(struct page *);
139
140 new = kzalloc(sizeof(struct agp_memory), GFP_KERNEL);
141 if (new == NULL)
142 return NULL;
143
144 new->key = agp_get_key();
145
146 if (new->key < 0) {
147 kfree(new);
148 return NULL;
149 }
150
151 agp_alloc_page_array(alloc_size, new);
152
153 if (new->memory == NULL) {
154 agp_free_key(new->key);
155 kfree(new);
156 return NULL;
157 }
158 new->num_scratch_pages = 0;
159 return new;
160}
104 161
105struct agp_memory *agp_create_memory(int scratch_pages) 162struct agp_memory *agp_create_memory(int scratch_pages)
106{ 163{
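
The agp_alloc_page_array() helper added above keeps small page lists on kmalloc (cheap, physically contiguous) and only falls back to vmalloc beyond two pages' worth of data; the kmalloc attempt uses __GFP_NORETRY so a failure drops through quietly. A stand-alone sketch of where that threshold lands, assuming 4096-byte pages and 8-byte entries as on x86_64:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define ENTRY_SIZE	sizeof(unsigned long)	/* one list entry per AGP page */

static const char *allocator_for(unsigned long pages)
{
	unsigned long bytes = pages * ENTRY_SIZE;

	return bytes <= 2 * PAGE_SIZE ? "kmalloc" : "vmalloc";
}

int main(void)
{
	printf("8-page allocation:          %s\n", allocator_for(8));
	printf("64MB aperture (16k pages):  %s\n", allocator_for(16384));
	return 0;
}
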
@@ -116,7 +173,8 @@ struct agp_memory *agp_create_memory(int scratch_pages)
116 kfree(new); 173 kfree(new);
117 return NULL; 174 return NULL;
118 } 175 }
119 new->memory = vmalloc(PAGE_SIZE * scratch_pages); 176
177 agp_alloc_page_array(PAGE_SIZE * scratch_pages, new);
120 178
121 if (new->memory == NULL) { 179 if (new->memory == NULL) {
122 agp_free_key(new->key); 180 agp_free_key(new->key);
@@ -124,6 +182,7 @@ struct agp_memory *agp_create_memory(int scratch_pages)
124 return NULL; 182 return NULL;
125 } 183 }
126 new->num_scratch_pages = scratch_pages; 184 new->num_scratch_pages = scratch_pages;
185 new->type = AGP_NORMAL_MEMORY;
127 return new; 186 return new;
128} 187}
129EXPORT_SYMBOL(agp_create_memory); 188EXPORT_SYMBOL(agp_create_memory);
@@ -146,6 +205,11 @@ void agp_free_memory(struct agp_memory *curr)
146 if (curr->is_bound == TRUE) 205 if (curr->is_bound == TRUE)
147 agp_unbind_memory(curr); 206 agp_unbind_memory(curr);
148 207
208 if (curr->type >= AGP_USER_TYPES) {
209 agp_generic_free_by_type(curr);
210 return;
211 }
212
149 if (curr->type != 0) { 213 if (curr->type != 0) {
150 curr->bridge->driver->free_by_type(curr); 214 curr->bridge->driver->free_by_type(curr);
151 return; 215 return;
@@ -157,7 +221,7 @@ void agp_free_memory(struct agp_memory *curr)
157 flush_agp_mappings(); 221 flush_agp_mappings();
158 } 222 }
159 agp_free_key(curr->key); 223 agp_free_key(curr->key);
160 vfree(curr->memory); 224 agp_free_page_array(curr);
161 kfree(curr); 225 kfree(curr);
162} 226}
163EXPORT_SYMBOL(agp_free_memory); 227EXPORT_SYMBOL(agp_free_memory);
@@ -188,6 +252,13 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge,
188 if ((atomic_read(&bridge->current_memory_agp) + page_count) > bridge->max_memory_agp) 252 if ((atomic_read(&bridge->current_memory_agp) + page_count) > bridge->max_memory_agp)
189 return NULL; 253 return NULL;
190 254
255 if (type >= AGP_USER_TYPES) {
256 new = agp_generic_alloc_user(page_count, type);
257 if (new)
258 new->bridge = bridge;
259 return new;
260 }
261
191 if (type != 0) { 262 if (type != 0) {
192 new = bridge->driver->alloc_by_type(page_count, type); 263 new = bridge->driver->alloc_by_type(page_count, type);
193 if (new) 264 if (new)
@@ -960,6 +1031,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type)
960 off_t j; 1031 off_t j;
961 void *temp; 1032 void *temp;
962 struct agp_bridge_data *bridge; 1033 struct agp_bridge_data *bridge;
1034 int mask_type;
963 1035
964 bridge = mem->bridge; 1036 bridge = mem->bridge;
965 if (!bridge) 1037 if (!bridge)
@@ -995,7 +1067,11 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type)
995 num_entries -= agp_memory_reserved/PAGE_SIZE; 1067 num_entries -= agp_memory_reserved/PAGE_SIZE;
996 if (num_entries < 0) num_entries = 0; 1068 if (num_entries < 0) num_entries = 0;
997 1069
998 if (type != 0 || mem->type != 0) { 1070 if (type != mem->type)
1071 return -EINVAL;
1072
1073 mask_type = bridge->driver->agp_type_to_mask_type(bridge, type);
1074 if (mask_type != 0) {
999 /* The generic routines know nothing of memory types */ 1075 /* The generic routines know nothing of memory types */
1000 return -EINVAL; 1076 return -EINVAL;
1001 } 1077 }
@@ -1018,7 +1094,8 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type)
1018 } 1094 }
1019 1095
1020 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { 1096 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
1021 writel(bridge->driver->mask_memory(bridge, mem->memory[i], mem->type), bridge->gatt_table+j); 1097 writel(bridge->driver->mask_memory(bridge, mem->memory[i], mask_type),
1098 bridge->gatt_table+j);
1022 } 1099 }
1023 readl(bridge->gatt_table+j-1); /* PCI Posting. */ 1100 readl(bridge->gatt_table+j-1); /* PCI Posting. */
1024 1101
@@ -1032,6 +1109,7 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type)
1032{ 1109{
1033 size_t i; 1110 size_t i;
1034 struct agp_bridge_data *bridge; 1111 struct agp_bridge_data *bridge;
1112 int mask_type;
1035 1113
1036 bridge = mem->bridge; 1114 bridge = mem->bridge;
1037 if (!bridge) 1115 if (!bridge)
@@ -1040,7 +1118,11 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type)
1040 if (mem->page_count == 0) 1118 if (mem->page_count == 0)
1041 return 0; 1119 return 0;
1042 1120
1043 if (type != 0 || mem->type != 0) { 1121 if (type != mem->type)
1122 return -EINVAL;
1123
1124 mask_type = bridge->driver->agp_type_to_mask_type(bridge, type);
1125 if (mask_type != 0) {
1044 /* The generic routines know nothing of memory types */ 1126 /* The generic routines know nothing of memory types */
1045 return -EINVAL; 1127 return -EINVAL;
1046 } 1128 }
@@ -1056,22 +1138,40 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type)
1056} 1138}
1057EXPORT_SYMBOL(agp_generic_remove_memory); 1139EXPORT_SYMBOL(agp_generic_remove_memory);
1058 1140
1059
1060struct agp_memory *agp_generic_alloc_by_type(size_t page_count, int type) 1141struct agp_memory *agp_generic_alloc_by_type(size_t page_count, int type)
1061{ 1142{
1062 return NULL; 1143 return NULL;
1063} 1144}
1064EXPORT_SYMBOL(agp_generic_alloc_by_type); 1145EXPORT_SYMBOL(agp_generic_alloc_by_type);
1065 1146
1066
1067void agp_generic_free_by_type(struct agp_memory *curr) 1147void agp_generic_free_by_type(struct agp_memory *curr)
1068{ 1148{
1069 vfree(curr->memory); 1149 agp_free_page_array(curr);
1070 agp_free_key(curr->key); 1150 agp_free_key(curr->key);
1071 kfree(curr); 1151 kfree(curr);
1072} 1152}
1073EXPORT_SYMBOL(agp_generic_free_by_type); 1153EXPORT_SYMBOL(agp_generic_free_by_type);
1074 1154
1155struct agp_memory *agp_generic_alloc_user(size_t page_count, int type)
1156{
1157 struct agp_memory *new;
1158 int i;
1159 int pages;
1160
1161 pages = (page_count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
1162 new = agp_create_user_memory(page_count);
1163 if (new == NULL)
1164 return NULL;
1165
1166 for (i = 0; i < page_count; i++)
1167 new->memory[i] = 0;
1168 new->page_count = 0;
1169 new->type = type;
1170 new->num_scratch_pages = pages;
1171
1172 return new;
1173}
1174EXPORT_SYMBOL(agp_generic_alloc_user);
1075 1175
1076/* 1176/*
1077 * Basic Page Allocation Routines - 1177 * Basic Page Allocation Routines -
@@ -1165,6 +1265,15 @@ unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge,
1165} 1265}
1166EXPORT_SYMBOL(agp_generic_mask_memory); 1266EXPORT_SYMBOL(agp_generic_mask_memory);
1167 1267
1268int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge,
1269 int type)
1270{
1271 if (type >= AGP_USER_TYPES)
1272 return 0;
1273 return type;
1274}
1275EXPORT_SYMBOL(agp_generic_type_to_mask_type);
1276
1168/* 1277/*
1169 * These functions are implemented according to the AGPv3 spec, 1278 * These functions are implemented according to the AGPv3 spec,
1170 * which covers implementation details that had previously been 1279 * which covers implementation details that had previously been
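
Unlike the driver-populated paths, agp_generic_alloc_user() hands back an empty descriptor: the entries are zeroed and page_count is 0, and an in-kernel user such as a DRM memory manager is expected to fill in its own pages before binding. A kernel-style sketch of that calling pattern (builds only in-tree; AGP_USER_MEMORY is the assumed agp_backend.h constant, and the helper name and its already-translated address array are illustrative):

#include <linux/agp_backend.h>

static struct agp_memory *bind_driver_pages(struct agp_bridge_data *bridge,
					    unsigned long *phys, size_t n,
					    off_t offset)
{
	struct agp_memory *mem;
	size_t i;

	mem = agp_allocate_memory(bridge, n, AGP_USER_MEMORY);
	if (!mem)
		return NULL;

	for (i = 0; i < n; i++)		/* caller supplies the pages */
		mem->memory[i] = phys[i];
	mem->page_count = n;

	if (agp_bind_memory(mem, offset)) {
		agp_free_memory(mem);
		return NULL;
	}
	return mem;
}
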
diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c
index 907fb66ec4a9..847deabf7f9b 100644
--- a/drivers/char/agp/hp-agp.c
+++ b/drivers/char/agp/hp-agp.c
@@ -438,6 +438,7 @@ struct agp_bridge_driver hp_zx1_driver = {
438 .free_by_type = agp_generic_free_by_type, 438 .free_by_type = agp_generic_free_by_type,
439 .agp_alloc_page = agp_generic_alloc_page, 439 .agp_alloc_page = agp_generic_alloc_page,
440 .agp_destroy_page = agp_generic_destroy_page, 440 .agp_destroy_page = agp_generic_destroy_page,
441 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
441 .cant_use_aperture = 1, 442 .cant_use_aperture = 1,
442}; 443};
443 444
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index 91769443d8fe..3e7618653abd 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -293,6 +293,9 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem,
293 pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n", 293 pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n",
294 mem, pg_start, type, mem->memory[0]); 294 mem, pg_start, type, mem->memory[0]);
295 295
296 if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
297 return -EINVAL;
298
296 io_pg_start = I460_IOPAGES_PER_KPAGE * pg_start; 299 io_pg_start = I460_IOPAGES_PER_KPAGE * pg_start;
297 300
298 temp = agp_bridge->current_size; 301 temp = agp_bridge->current_size;
@@ -396,6 +399,9 @@ static int i460_insert_memory_large_io_page (struct agp_memory *mem,
396 struct lp_desc *start, *end, *lp; 399 struct lp_desc *start, *end, *lp;
397 void *temp; 400 void *temp;
398 401
402 if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
403 return -EINVAL;
404
399 temp = agp_bridge->current_size; 405 temp = agp_bridge->current_size;
400 num_entries = A_SIZE_8(temp)->num_entries; 406 num_entries = A_SIZE_8(temp)->num_entries;
401 407
@@ -572,6 +578,7 @@ struct agp_bridge_driver intel_i460_driver = {
572#endif 578#endif
573 .alloc_by_type = agp_generic_alloc_by_type, 579 .alloc_by_type = agp_generic_alloc_by_type,
574 .free_by_type = agp_generic_free_by_type, 580 .free_by_type = agp_generic_free_by_type,
581 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
575 .cant_use_aperture = 1, 582 .cant_use_aperture = 1,
576}; 583};
577 584
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index a3011de51f7c..06b0bb6d982f 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/pci.h> 6#include <linux/pci.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/kernel.h>
8#include <linux/pagemap.h> 9#include <linux/pagemap.h>
9#include <linux/agp_backend.h> 10#include <linux/agp_backend.h>
10#include "agp.h" 11#include "agp.h"
@@ -24,6 +25,9 @@
24 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB) 25 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB)
25 26
26 27
28extern int agp_memory_reserved;
29
30
27/* Intel 815 register */ 31/* Intel 815 register */
28#define INTEL_815_APCONT 0x51 32#define INTEL_815_APCONT 0x51
29#define INTEL_815_ATTBASE_MASK ~0x1FFFFFFF 33#define INTEL_815_ATTBASE_MASK ~0x1FFFFFFF
@@ -68,12 +72,15 @@ static struct aper_size_info_fixed intel_i810_sizes[] =
68 72
69#define AGP_DCACHE_MEMORY 1 73#define AGP_DCACHE_MEMORY 1
70#define AGP_PHYS_MEMORY 2 74#define AGP_PHYS_MEMORY 2
75#define INTEL_AGP_CACHED_MEMORY 3
71 76
72static struct gatt_mask intel_i810_masks[] = 77static struct gatt_mask intel_i810_masks[] =
73{ 78{
74 {.mask = I810_PTE_VALID, .type = 0}, 79 {.mask = I810_PTE_VALID, .type = 0},
75 {.mask = (I810_PTE_VALID | I810_PTE_LOCAL), .type = AGP_DCACHE_MEMORY}, 80 {.mask = (I810_PTE_VALID | I810_PTE_LOCAL), .type = AGP_DCACHE_MEMORY},
76 {.mask = I810_PTE_VALID, .type = 0} 81 {.mask = I810_PTE_VALID, .type = 0},
82 {.mask = I810_PTE_VALID | I830_PTE_SYSTEM_CACHED,
83 .type = INTEL_AGP_CACHED_MEMORY}
77}; 84};
78 85
79static struct _intel_i810_private { 86static struct _intel_i810_private {
@@ -117,13 +124,15 @@ static int intel_i810_configure(void)
117 124
118 current_size = A_SIZE_FIX(agp_bridge->current_size); 125 current_size = A_SIZE_FIX(agp_bridge->current_size);
119 126
120 pci_read_config_dword(intel_i810_private.i810_dev, I810_MMADDR, &temp);
121 temp &= 0xfff80000;
122
123 intel_i810_private.registers = ioremap(temp, 128 * 4096);
124 if (!intel_i810_private.registers) { 127 if (!intel_i810_private.registers) {
125 printk(KERN_ERR PFX "Unable to remap memory.\n"); 128 pci_read_config_dword(intel_i810_private.i810_dev, I810_MMADDR, &temp);
126 return -ENOMEM; 129 temp &= 0xfff80000;
130
131 intel_i810_private.registers = ioremap(temp, 128 * 4096);
132 if (!intel_i810_private.registers) {
133 printk(KERN_ERR PFX "Unable to remap memory.\n");
134 return -ENOMEM;
135 }
127 } 136 }
128 137
129 if ((readl(intel_i810_private.registers+I810_DRAM_CTL) 138 if ((readl(intel_i810_private.registers+I810_DRAM_CTL)
@@ -201,62 +210,79 @@ static void i8xx_destroy_pages(void *addr)
201 atomic_dec(&agp_bridge->current_memory_agp); 210 atomic_dec(&agp_bridge->current_memory_agp);
202} 211}
203 212
213static int intel_i830_type_to_mask_type(struct agp_bridge_data *bridge,
214 int type)
215{
216 if (type < AGP_USER_TYPES)
217 return type;
218 else if (type == AGP_USER_CACHED_MEMORY)
219 return INTEL_AGP_CACHED_MEMORY;
220 else
221 return 0;
222}
223
204static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, 224static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start,
205 int type) 225 int type)
206{ 226{
207 int i, j, num_entries; 227 int i, j, num_entries;
208 void *temp; 228 void *temp;
229 int ret = -EINVAL;
230 int mask_type;
209 231
210 if (mem->page_count == 0) 232 if (mem->page_count == 0)
211 return 0; 233 goto out;
212 234
213 temp = agp_bridge->current_size; 235 temp = agp_bridge->current_size;
214 num_entries = A_SIZE_FIX(temp)->num_entries; 236 num_entries = A_SIZE_FIX(temp)->num_entries;
215 237
216 if ((pg_start + mem->page_count) > num_entries) 238 if ((pg_start + mem->page_count) > num_entries)
217 return -EINVAL; 239 goto out_err;
218 240
219 for (j = pg_start; j < (pg_start + mem->page_count); j++) {
220 if (!PGE_EMPTY(agp_bridge, readl(agp_bridge->gatt_table+j)))
221 return -EBUSY;
222 }
223 241
224 if (type != 0 || mem->type != 0) { 242 for (j = pg_start; j < (pg_start + mem->page_count); j++) {
225 if ((type == AGP_DCACHE_MEMORY) && (mem->type == AGP_DCACHE_MEMORY)) { 243 if (!PGE_EMPTY(agp_bridge, readl(agp_bridge->gatt_table+j))) {
226 /* special insert */ 244 ret = -EBUSY;
227 if (!mem->is_flushed) { 245 goto out_err;
228 global_cache_flush();
229 mem->is_flushed = TRUE;
230 }
231
232 for (i = pg_start; i < (pg_start + mem->page_count); i++) {
233 writel((i*4096)|I810_PTE_LOCAL|I810_PTE_VALID, intel_i810_private.registers+I810_PTE_BASE+(i*4));
234 }
235 readl(intel_i810_private.registers+I810_PTE_BASE+((i-1)*4)); /* PCI Posting. */
236
237 agp_bridge->driver->tlb_flush(mem);
238 return 0;
239 } 246 }
240 if ((type == AGP_PHYS_MEMORY) && (mem->type == AGP_PHYS_MEMORY))
241 goto insert;
242 return -EINVAL;
243 } 247 }
244 248
245insert: 249 if (type != mem->type)
246 if (!mem->is_flushed) { 250 goto out_err;
247 global_cache_flush();
248 mem->is_flushed = TRUE;
249 }
250 251
251 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { 252 mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type);
252 writel(agp_bridge->driver->mask_memory(agp_bridge, 253
253 mem->memory[i], mem->type), 254 switch (mask_type) {
254 intel_i810_private.registers+I810_PTE_BASE+(j*4)); 255 case AGP_DCACHE_MEMORY:
256 if (!mem->is_flushed)
257 global_cache_flush();
258 for (i = pg_start; i < (pg_start + mem->page_count); i++) {
259 writel((i*4096)|I810_PTE_LOCAL|I810_PTE_VALID,
260 intel_i810_private.registers+I810_PTE_BASE+(i*4));
261 }
262 readl(intel_i810_private.registers+I810_PTE_BASE+((i-1)*4));
263 break;
264 case AGP_PHYS_MEMORY:
265 case AGP_NORMAL_MEMORY:
266 if (!mem->is_flushed)
267 global_cache_flush();
268 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
269 writel(agp_bridge->driver->mask_memory(agp_bridge,
270 mem->memory[i],
271 mask_type),
272 intel_i810_private.registers+I810_PTE_BASE+(j*4));
273 }
274 readl(intel_i810_private.registers+I810_PTE_BASE+((j-1)*4));
275 break;
276 default:
277 goto out_err;
255 } 278 }
256 readl(intel_i810_private.registers+I810_PTE_BASE+((j-1)*4)); /* PCI Posting. */
257 279
258 agp_bridge->driver->tlb_flush(mem); 280 agp_bridge->driver->tlb_flush(mem);
259 return 0; 281out:
282 ret = 0;
283out_err:
284 mem->is_flushed = 1;
285 return ret;
260} 286}
261 287
262static int intel_i810_remove_entries(struct agp_memory *mem, off_t pg_start, 288static int intel_i810_remove_entries(struct agp_memory *mem, off_t pg_start,
@@ -337,12 +363,11 @@ static struct agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type)
337 new->type = AGP_DCACHE_MEMORY; 363 new->type = AGP_DCACHE_MEMORY;
338 new->page_count = pg_count; 364 new->page_count = pg_count;
339 new->num_scratch_pages = 0; 365 new->num_scratch_pages = 0;
340 vfree(new->memory); 366 agp_free_page_array(new);
341 return new; 367 return new;
342 } 368 }
343 if (type == AGP_PHYS_MEMORY) 369 if (type == AGP_PHYS_MEMORY)
344 return alloc_agpphysmem_i8xx(pg_count, type); 370 return alloc_agpphysmem_i8xx(pg_count, type);
345
346 return NULL; 371 return NULL;
347} 372}
348 373
@@ -357,7 +382,7 @@ static void intel_i810_free_by_type(struct agp_memory *curr)
357 gart_to_virt(curr->memory[0])); 382 gart_to_virt(curr->memory[0]));
358 global_flush_tlb(); 383 global_flush_tlb();
359 } 384 }
360 vfree(curr->memory); 385 agp_free_page_array(curr);
361 } 386 }
362 kfree(curr); 387 kfree(curr);
363} 388}
@@ -619,9 +644,11 @@ static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int
619{ 644{
620 int i,j,num_entries; 645 int i,j,num_entries;
621 void *temp; 646 void *temp;
647 int ret = -EINVAL;
648 int mask_type;
622 649
623 if (mem->page_count == 0) 650 if (mem->page_count == 0)
624 return 0; 651 goto out;
625 652
626 temp = agp_bridge->current_size; 653 temp = agp_bridge->current_size;
627 num_entries = A_SIZE_FIX(temp)->num_entries; 654 num_entries = A_SIZE_FIX(temp)->num_entries;
@@ -631,34 +658,41 @@ static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int
631 pg_start,intel_i830_private.gtt_entries); 658 pg_start,intel_i830_private.gtt_entries);
632 659
633 printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n"); 660 printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n");
634 return -EINVAL; 661 goto out_err;
635 } 662 }
636 663
637 if ((pg_start + mem->page_count) > num_entries) 664 if ((pg_start + mem->page_count) > num_entries)
638 return -EINVAL; 665 goto out_err;
639 666
640 /* The i830 can't check the GTT for entries since its read only, 667 /* The i830 can't check the GTT for entries since its read only,
641 * depend on the caller to make the correct offset decisions. 668 * depend on the caller to make the correct offset decisions.
642 */ 669 */
643 670
644 if ((type != 0 && type != AGP_PHYS_MEMORY) || 671 if (type != mem->type)
645 (mem->type != 0 && mem->type != AGP_PHYS_MEMORY)) 672 goto out_err;
646 return -EINVAL; 673
674 mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type);
647 675
648 if (!mem->is_flushed) { 676 if (mask_type != 0 && mask_type != AGP_PHYS_MEMORY &&
677 mask_type != INTEL_AGP_CACHED_MEMORY)
678 goto out_err;
679
680 if (!mem->is_flushed)
649 global_cache_flush(); 681 global_cache_flush();
650 mem->is_flushed = TRUE;
651 }
652 682
653 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { 683 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
654 writel(agp_bridge->driver->mask_memory(agp_bridge, 684 writel(agp_bridge->driver->mask_memory(agp_bridge,
655 mem->memory[i], mem->type), 685 mem->memory[i], mask_type),
656 intel_i830_private.registers+I810_PTE_BASE+(j*4)); 686 intel_i830_private.registers+I810_PTE_BASE+(j*4));
657 } 687 }
658 readl(intel_i830_private.registers+I810_PTE_BASE+((j-1)*4)); 688 readl(intel_i830_private.registers+I810_PTE_BASE+((j-1)*4));
659
660 agp_bridge->driver->tlb_flush(mem); 689 agp_bridge->driver->tlb_flush(mem);
661 return 0; 690
691out:
692 ret = 0;
693out_err:
694 mem->is_flushed = 1;
695 return ret;
662} 696}
663 697
664static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start, 698static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start,
@@ -687,7 +721,6 @@ static struct agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type)
687{ 721{
688 if (type == AGP_PHYS_MEMORY) 722 if (type == AGP_PHYS_MEMORY)
689 return alloc_agpphysmem_i8xx(pg_count, type); 723 return alloc_agpphysmem_i8xx(pg_count, type);
690
691 /* always return NULL for other allocation types for now */ 724 /* always return NULL for other allocation types for now */
692 return NULL; 725 return NULL;
693} 726}
@@ -734,9 +767,11 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
734{ 767{
735 int i,j,num_entries; 768 int i,j,num_entries;
736 void *temp; 769 void *temp;
770 int ret = -EINVAL;
771 int mask_type;
737 772
738 if (mem->page_count == 0) 773 if (mem->page_count == 0)
739 return 0; 774 goto out;
740 775
741 temp = agp_bridge->current_size; 776 temp = agp_bridge->current_size;
742 num_entries = A_SIZE_FIX(temp)->num_entries; 777 num_entries = A_SIZE_FIX(temp)->num_entries;
@@ -746,33 +781,41 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
746 pg_start,intel_i830_private.gtt_entries); 781 pg_start,intel_i830_private.gtt_entries);
747 782
748 printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n"); 783 printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n");
749 return -EINVAL; 784 goto out_err;
750 } 785 }
751 786
752 if ((pg_start + mem->page_count) > num_entries) 787 if ((pg_start + mem->page_count) > num_entries)
753 return -EINVAL; 788 goto out_err;
754 789
755 /* The i830 can't check the GTT for entries since its read only, 790 /* The i915 can't check the GTT for entries since its read only,
756 * depend on the caller to make the correct offset decisions. 791 * depend on the caller to make the correct offset decisions.
757 */ 792 */
758 793
759 if ((type != 0 && type != AGP_PHYS_MEMORY) || 794 if (type != mem->type)
760 (mem->type != 0 && mem->type != AGP_PHYS_MEMORY)) 795 goto out_err;
761 return -EINVAL; 796
797 mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type);
762 798
763 if (!mem->is_flushed) { 799 if (mask_type != 0 && mask_type != AGP_PHYS_MEMORY &&
800 mask_type != INTEL_AGP_CACHED_MEMORY)
801 goto out_err;
802
803 if (!mem->is_flushed)
764 global_cache_flush(); 804 global_cache_flush();
765 mem->is_flushed = TRUE;
766 }
767 805
768 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { 806 for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
769 writel(agp_bridge->driver->mask_memory(agp_bridge, 807 writel(agp_bridge->driver->mask_memory(agp_bridge,
770 mem->memory[i], mem->type), intel_i830_private.gtt+j); 808 mem->memory[i], mask_type), intel_i830_private.gtt+j);
771 } 809 }
772 readl(intel_i830_private.gtt+j-1);
773 810
811 readl(intel_i830_private.gtt+j-1);
774 agp_bridge->driver->tlb_flush(mem); 812 agp_bridge->driver->tlb_flush(mem);
775 return 0; 813
814 out:
815 ret = 0;
816 out_err:
817 mem->is_flushed = 1;
818 return ret;
776} 819}
777 820
778static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start, 821static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start,
@@ -803,7 +846,7 @@ static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start,
803 */ 846 */
804static int intel_i9xx_fetch_size(void) 847static int intel_i9xx_fetch_size(void)
805{ 848{
806 int num_sizes = sizeof(intel_i830_sizes) / sizeof(*intel_i830_sizes); 849 int num_sizes = ARRAY_SIZE(intel_i830_sizes);
807 int aper_size; /* size in megabytes */ 850 int aper_size; /* size in megabytes */
808 int i; 851 int i;
809 852
@@ -1384,6 +1427,7 @@ static struct agp_bridge_driver intel_generic_driver = {
1384 .free_by_type = agp_generic_free_by_type, 1427 .free_by_type = agp_generic_free_by_type,
1385 .agp_alloc_page = agp_generic_alloc_page, 1428 .agp_alloc_page = agp_generic_alloc_page,
1386 .agp_destroy_page = agp_generic_destroy_page, 1429 .agp_destroy_page = agp_generic_destroy_page,
1430 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1387}; 1431};
1388 1432
1389static struct agp_bridge_driver intel_810_driver = { 1433static struct agp_bridge_driver intel_810_driver = {
@@ -1408,6 +1452,7 @@ static struct agp_bridge_driver intel_810_driver = {
1408 .free_by_type = intel_i810_free_by_type, 1452 .free_by_type = intel_i810_free_by_type,
1409 .agp_alloc_page = agp_generic_alloc_page, 1453 .agp_alloc_page = agp_generic_alloc_page,
1410 .agp_destroy_page = agp_generic_destroy_page, 1454 .agp_destroy_page = agp_generic_destroy_page,
1455 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1411}; 1456};
1412 1457
1413static struct agp_bridge_driver intel_815_driver = { 1458static struct agp_bridge_driver intel_815_driver = {
@@ -1431,6 +1476,7 @@ static struct agp_bridge_driver intel_815_driver = {
1431 .free_by_type = agp_generic_free_by_type, 1476 .free_by_type = agp_generic_free_by_type,
1432 .agp_alloc_page = agp_generic_alloc_page, 1477 .agp_alloc_page = agp_generic_alloc_page,
1433 .agp_destroy_page = agp_generic_destroy_page, 1478 .agp_destroy_page = agp_generic_destroy_page,
1479 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1434}; 1480};
1435 1481
1436static struct agp_bridge_driver intel_830_driver = { 1482static struct agp_bridge_driver intel_830_driver = {
@@ -1455,6 +1501,7 @@ static struct agp_bridge_driver intel_830_driver = {
1455 .free_by_type = intel_i810_free_by_type, 1501 .free_by_type = intel_i810_free_by_type,
1456 .agp_alloc_page = agp_generic_alloc_page, 1502 .agp_alloc_page = agp_generic_alloc_page,
1457 .agp_destroy_page = agp_generic_destroy_page, 1503 .agp_destroy_page = agp_generic_destroy_page,
1504 .agp_type_to_mask_type = intel_i830_type_to_mask_type,
1458}; 1505};
1459 1506
1460static struct agp_bridge_driver intel_820_driver = { 1507static struct agp_bridge_driver intel_820_driver = {
@@ -1478,6 +1525,7 @@ static struct agp_bridge_driver intel_820_driver = {
1478 .free_by_type = agp_generic_free_by_type, 1525 .free_by_type = agp_generic_free_by_type,
1479 .agp_alloc_page = agp_generic_alloc_page, 1526 .agp_alloc_page = agp_generic_alloc_page,
1480 .agp_destroy_page = agp_generic_destroy_page, 1527 .agp_destroy_page = agp_generic_destroy_page,
1528 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1481}; 1529};
1482 1530
1483static struct agp_bridge_driver intel_830mp_driver = { 1531static struct agp_bridge_driver intel_830mp_driver = {
@@ -1501,6 +1549,7 @@ static struct agp_bridge_driver intel_830mp_driver = {
1501 .free_by_type = agp_generic_free_by_type, 1549 .free_by_type = agp_generic_free_by_type,
1502 .agp_alloc_page = agp_generic_alloc_page, 1550 .agp_alloc_page = agp_generic_alloc_page,
1503 .agp_destroy_page = agp_generic_destroy_page, 1551 .agp_destroy_page = agp_generic_destroy_page,
1552 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1504}; 1553};
1505 1554
1506static struct agp_bridge_driver intel_840_driver = { 1555static struct agp_bridge_driver intel_840_driver = {
@@ -1524,6 +1573,7 @@ static struct agp_bridge_driver intel_840_driver = {
1524 .free_by_type = agp_generic_free_by_type, 1573 .free_by_type = agp_generic_free_by_type,
1525 .agp_alloc_page = agp_generic_alloc_page, 1574 .agp_alloc_page = agp_generic_alloc_page,
1526 .agp_destroy_page = agp_generic_destroy_page, 1575 .agp_destroy_page = agp_generic_destroy_page,
1576 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1527}; 1577};
1528 1578
1529static struct agp_bridge_driver intel_845_driver = { 1579static struct agp_bridge_driver intel_845_driver = {
@@ -1547,6 +1597,7 @@ static struct agp_bridge_driver intel_845_driver = {
1547 .free_by_type = agp_generic_free_by_type, 1597 .free_by_type = agp_generic_free_by_type,
1548 .agp_alloc_page = agp_generic_alloc_page, 1598 .agp_alloc_page = agp_generic_alloc_page,
1549 .agp_destroy_page = agp_generic_destroy_page, 1599 .agp_destroy_page = agp_generic_destroy_page,
1600 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1550}; 1601};
1551 1602
1552static struct agp_bridge_driver intel_850_driver = { 1603static struct agp_bridge_driver intel_850_driver = {
@@ -1570,6 +1621,7 @@ static struct agp_bridge_driver intel_850_driver = {
1570 .free_by_type = agp_generic_free_by_type, 1621 .free_by_type = agp_generic_free_by_type,
1571 .agp_alloc_page = agp_generic_alloc_page, 1622 .agp_alloc_page = agp_generic_alloc_page,
1572 .agp_destroy_page = agp_generic_destroy_page, 1623 .agp_destroy_page = agp_generic_destroy_page,
1624 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1573}; 1625};
1574 1626
1575static struct agp_bridge_driver intel_860_driver = { 1627static struct agp_bridge_driver intel_860_driver = {
@@ -1593,6 +1645,7 @@ static struct agp_bridge_driver intel_860_driver = {
1593 .free_by_type = agp_generic_free_by_type, 1645 .free_by_type = agp_generic_free_by_type,
1594 .agp_alloc_page = agp_generic_alloc_page, 1646 .agp_alloc_page = agp_generic_alloc_page,
1595 .agp_destroy_page = agp_generic_destroy_page, 1647 .agp_destroy_page = agp_generic_destroy_page,
1648 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1596}; 1649};
1597 1650
1598static struct agp_bridge_driver intel_915_driver = { 1651static struct agp_bridge_driver intel_915_driver = {
@@ -1617,6 +1670,7 @@ static struct agp_bridge_driver intel_915_driver = {
1617 .free_by_type = intel_i810_free_by_type, 1670 .free_by_type = intel_i810_free_by_type,
1618 .agp_alloc_page = agp_generic_alloc_page, 1671 .agp_alloc_page = agp_generic_alloc_page,
1619 .agp_destroy_page = agp_generic_destroy_page, 1672 .agp_destroy_page = agp_generic_destroy_page,
1673 .agp_type_to_mask_type = intel_i830_type_to_mask_type,
1620}; 1674};
1621 1675
1622static struct agp_bridge_driver intel_i965_driver = { 1676static struct agp_bridge_driver intel_i965_driver = {
@@ -1641,6 +1695,7 @@ static struct agp_bridge_driver intel_i965_driver = {
1641 .free_by_type = intel_i810_free_by_type, 1695 .free_by_type = intel_i810_free_by_type,
1642 .agp_alloc_page = agp_generic_alloc_page, 1696 .agp_alloc_page = agp_generic_alloc_page,
1643 .agp_destroy_page = agp_generic_destroy_page, 1697 .agp_destroy_page = agp_generic_destroy_page,
1698 .agp_type_to_mask_type = intel_i830_type_to_mask_type,
1644}; 1699};
1645 1700
1646static struct agp_bridge_driver intel_7505_driver = { 1701static struct agp_bridge_driver intel_7505_driver = {
@@ -1664,6 +1719,7 @@ static struct agp_bridge_driver intel_7505_driver = {
1664 .free_by_type = agp_generic_free_by_type, 1719 .free_by_type = agp_generic_free_by_type,
1665 .agp_alloc_page = agp_generic_alloc_page, 1720 .agp_alloc_page = agp_generic_alloc_page,
1666 .agp_destroy_page = agp_generic_destroy_page, 1721 .agp_destroy_page = agp_generic_destroy_page,
1722 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
1667}; 1723};
1668 1724
1669static int find_i810(u16 device) 1725static int find_i810(u16 device)
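The intel-agp hunks above add a third GTT entry type, INTEL_AGP_CACHED_MEMORY, and route all type handling through the new agp_type_to_mask_type hook, so a request for AGP_USER_CACHED_MEMORY ends up as a PTE with I830_PTE_SYSTEM_CACHED set. A minimal sketch of how an in-kernel client could ask for such a mapping, assuming the usual agp_allocate_memory()/agp_bind_memory() backend calls (the helper itself is hypothetical):

    #include <linux/agp_backend.h>

    /* Hypothetical helper: bind "pages" cacheable pages at GTT offset pg_start.
     * AGP_USER_CACHED_MEMORY is translated by intel_i830_type_to_mask_type()
     * into INTEL_AGP_CACHED_MEMORY, i.e. cached GTT PTEs. */
    static struct agp_memory *example_bind_cached(struct agp_bridge_data *bridge,
                                                  size_t pages, off_t pg_start)
    {
            struct agp_memory *mem;

            mem = agp_allocate_memory(bridge, pages, AGP_USER_CACHED_MEMORY);
            if (!mem)
                    return NULL;

            if (agp_bind_memory(mem, pg_start)) {   /* writes the PTEs */
                    agp_free_memory(mem);
                    return NULL;
            }
            return mem;
    }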
diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c
index df7f37b2739a..2563286b2fcf 100644
--- a/drivers/char/agp/nvidia-agp.c
+++ b/drivers/char/agp/nvidia-agp.c
@@ -310,6 +310,7 @@ static struct agp_bridge_driver nvidia_driver = {
310 .free_by_type = agp_generic_free_by_type, 310 .free_by_type = agp_generic_free_by_type,
311 .agp_alloc_page = agp_generic_alloc_page, 311 .agp_alloc_page = agp_generic_alloc_page,
312 .agp_destroy_page = agp_generic_destroy_page, 312 .agp_destroy_page = agp_generic_destroy_page,
313 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
313}; 314};
314 315
315static int __devinit agp_nvidia_probe(struct pci_dev *pdev, 316static int __devinit agp_nvidia_probe(struct pci_dev *pdev,
diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c
index 17c50b0f83f0..b7b4590673ae 100644
--- a/drivers/char/agp/parisc-agp.c
+++ b/drivers/char/agp/parisc-agp.c
@@ -228,6 +228,7 @@ struct agp_bridge_driver parisc_agp_driver = {
228 .free_by_type = agp_generic_free_by_type, 228 .free_by_type = agp_generic_free_by_type,
229 .agp_alloc_page = agp_generic_alloc_page, 229 .agp_alloc_page = agp_generic_alloc_page,
230 .agp_destroy_page = agp_generic_destroy_page, 230 .agp_destroy_page = agp_generic_destroy_page,
231 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
231 .cant_use_aperture = 1, 232 .cant_use_aperture = 1,
232}; 233};
233 234
diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c
index 902648db7efa..92d1dc45b9be 100644
--- a/drivers/char/agp/sgi-agp.c
+++ b/drivers/char/agp/sgi-agp.c
@@ -265,6 +265,7 @@ struct agp_bridge_driver sgi_tioca_driver = {
265 .free_by_type = agp_generic_free_by_type, 265 .free_by_type = agp_generic_free_by_type,
266 .agp_alloc_page = sgi_tioca_alloc_page, 266 .agp_alloc_page = sgi_tioca_alloc_page,
267 .agp_destroy_page = agp_generic_destroy_page, 267 .agp_destroy_page = agp_generic_destroy_page,
268 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
268 .cant_use_aperture = 1, 269 .cant_use_aperture = 1,
269 .needs_scratch_page = 0, 270 .needs_scratch_page = 0,
270 .num_aperture_sizes = 1, 271 .num_aperture_sizes = 1,
diff --git a/drivers/char/agp/sis-agp.c b/drivers/char/agp/sis-agp.c
index a00fd48a6f05..60342b708152 100644
--- a/drivers/char/agp/sis-agp.c
+++ b/drivers/char/agp/sis-agp.c
@@ -140,6 +140,7 @@ static struct agp_bridge_driver sis_driver = {
140 .free_by_type = agp_generic_free_by_type, 140 .free_by_type = agp_generic_free_by_type,
141 .agp_alloc_page = agp_generic_alloc_page, 141 .agp_alloc_page = agp_generic_alloc_page,
142 .agp_destroy_page = agp_generic_destroy_page, 142 .agp_destroy_page = agp_generic_destroy_page,
143 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
143}; 144};
144 145
145static struct agp_device_ids sis_agp_device_ids[] __devinitdata = 146static struct agp_device_ids sis_agp_device_ids[] __devinitdata =
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c
index 4f2d7d99902f..9f5ae7714f85 100644
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -444,6 +444,7 @@ static struct agp_bridge_driver sworks_driver = {
444 .free_by_type = agp_generic_free_by_type, 444 .free_by_type = agp_generic_free_by_type,
445 .agp_alloc_page = agp_generic_alloc_page, 445 .agp_alloc_page = agp_generic_alloc_page,
446 .agp_destroy_page = agp_generic_destroy_page, 446 .agp_destroy_page = agp_generic_destroy_page,
447 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
447}; 448};
448 449
449static int __devinit agp_serverworks_probe(struct pci_dev *pdev, 450static int __devinit agp_serverworks_probe(struct pci_dev *pdev,
diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c
index dffc19382f7e..6c45702e542c 100644
--- a/drivers/char/agp/uninorth-agp.c
+++ b/drivers/char/agp/uninorth-agp.c
@@ -510,6 +510,7 @@ struct agp_bridge_driver uninorth_agp_driver = {
510 .free_by_type = agp_generic_free_by_type, 510 .free_by_type = agp_generic_free_by_type,
511 .agp_alloc_page = agp_generic_alloc_page, 511 .agp_alloc_page = agp_generic_alloc_page,
512 .agp_destroy_page = agp_generic_destroy_page, 512 .agp_destroy_page = agp_generic_destroy_page,
513 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
513 .cant_use_aperture = 1, 514 .cant_use_aperture = 1,
514}; 515};
515 516
@@ -534,6 +535,7 @@ struct agp_bridge_driver u3_agp_driver = {
534 .free_by_type = agp_generic_free_by_type, 535 .free_by_type = agp_generic_free_by_type,
535 .agp_alloc_page = agp_generic_alloc_page, 536 .agp_alloc_page = agp_generic_alloc_page,
536 .agp_destroy_page = agp_generic_destroy_page, 537 .agp_destroy_page = agp_generic_destroy_page,
538 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
537 .cant_use_aperture = 1, 539 .cant_use_aperture = 1,
538 .needs_scratch_page = 1, 540 .needs_scratch_page = 1,
539}; 541};
diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c
index 2ded7a280d7f..2e7c04370cd9 100644
--- a/drivers/char/agp/via-agp.c
+++ b/drivers/char/agp/via-agp.c
@@ -191,6 +191,7 @@ static struct agp_bridge_driver via_agp3_driver = {
191 .free_by_type = agp_generic_free_by_type, 191 .free_by_type = agp_generic_free_by_type,
192 .agp_alloc_page = agp_generic_alloc_page, 192 .agp_alloc_page = agp_generic_alloc_page,
193 .agp_destroy_page = agp_generic_destroy_page, 193 .agp_destroy_page = agp_generic_destroy_page,
194 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
194}; 195};
195 196
196static struct agp_bridge_driver via_driver = { 197static struct agp_bridge_driver via_driver = {
@@ -214,6 +215,7 @@ static struct agp_bridge_driver via_driver = {
214 .free_by_type = agp_generic_free_by_type, 215 .free_by_type = agp_generic_free_by_type,
215 .agp_alloc_page = agp_generic_alloc_page, 216 .agp_alloc_page = agp_generic_alloc_page,
216 .agp_destroy_page = agp_generic_destroy_page, 217 .agp_destroy_page = agp_generic_destroy_page,
218 .agp_type_to_mask_type = agp_generic_type_to_mask_type,
217}; 219};
218 220
219static struct agp_device_ids via_agp_device_ids[] __devinitdata = 221static struct agp_device_ids via_agp_device_ids[] __devinitdata =
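Each bridge driver touched from here back to intel-agp.c now fills in .agp_type_to_mask_type, and all of the non-Intel ones point at agp_generic_type_to_mask_type, whose body is not part of these hunks. Judging by how the callers use it, the generic fallback is presumably no more than:

    /* Sketch of the generic fallback (assumed to live in
     * drivers/char/agp/generic.c, not shown in this diff): legacy types map
     * to themselves, the new AGP_USER_* types fall back to plain memory. */
    int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge, int type)
    {
            if (type >= AGP_USER_TYPES)
                    return 0;               /* no special PTE bits */
            return type;
    }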
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 1aa93a752a9c..ae76a9ffe89f 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -117,7 +117,7 @@ __setup("hcheck_reboot", hangcheck_parse_reboot);
117__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 117__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
118#endif /* not MODULE */ 118#endif /* not MODULE */
119 119
120#if defined(CONFIG_X86_64) || defined(CONFIG_S390) 120#if defined(CONFIG_S390)
121# define HAVE_MONOTONIC 121# define HAVE_MONOTONIC
122# define TIMER_FREQ 1000000000ULL 122# define TIMER_FREQ 1000000000ULL
123#elif defined(CONFIG_IA64) 123#elif defined(CONFIG_IA64)
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index be73c80d699d..1d8c4ae61551 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -36,6 +36,7 @@
36#include <linux/workqueue.h> 36#include <linux/workqueue.h>
37#include <linux/kexec.h> 37#include <linux/kexec.h>
38#include <linux/irq.h> 38#include <linux/irq.h>
39#include <linux/hrtimer.h>
39 40
40#include <asm/ptrace.h> 41#include <asm/ptrace.h>
41#include <asm/irq_regs.h> 42#include <asm/irq_regs.h>
@@ -158,6 +159,17 @@ static struct sysrq_key_op sysrq_sync_op = {
158 .enable_mask = SYSRQ_ENABLE_SYNC, 159 .enable_mask = SYSRQ_ENABLE_SYNC,
159}; 160};
160 161
162static void sysrq_handle_show_timers(int key, struct tty_struct *tty)
163{
164 sysrq_timer_list_show();
165}
166
167static struct sysrq_key_op sysrq_show_timers_op = {
168 .handler = sysrq_handle_show_timers,
169 .help_msg = "show-all-timers(Q)",
170 .action_msg = "Show Pending Timers",
171};
172
161static void sysrq_handle_mountro(int key, struct tty_struct *tty) 173static void sysrq_handle_mountro(int key, struct tty_struct *tty)
162{ 174{
163 emergency_remount(); 175 emergency_remount();
@@ -335,7 +347,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
335 /* o: This will often be registered as 'Off' at init time */ 347 /* o: This will often be registered as 'Off' at init time */
336 NULL, /* o */ 348 NULL, /* o */
337 &sysrq_showregs_op, /* p */ 349 &sysrq_showregs_op, /* p */
338 NULL, /* q */ 350 &sysrq_show_timers_op, /* q */
339 &sysrq_unraw_op, /* r */ 351 &sysrq_unraw_op, /* r */
340 &sysrq_sync_op, /* s */ 352 &sysrq_sync_op, /* s */
341 &sysrq_showstate_op, /* t */ 353 &sysrq_showstate_op, /* t */
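The new 'q' entry relies on sysrq_timer_list_show(), which is why <linux/hrtimer.h> is now included; its declaration is presumably just:

    /* Expected declaration in include/linux/hrtimer.h (assumption, the header
     * change is not part of this hunk): dump all pending timers and
     * clockevent devices to the console. */
    extern void sysrq_timer_list_show(void);

On a running system the same dump can typically be triggered with SysRq-Q or by writing 'q' to /proc/sysrq-trigger.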
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index b6bcdbbf57b3..ccaa6a39cb4b 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -16,15 +16,13 @@
16 * This file is licensed under the GPL v2. 16 * This file is licensed under the GPL v2.
17 */ 17 */
18 18
19#include <linux/acpi_pmtmr.h>
19#include <linux/clocksource.h> 20#include <linux/clocksource.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/pci.h> 23#include <linux/pci.h>
23#include <asm/io.h> 24#include <asm/io.h>
24 25
25/* Number of PMTMR ticks expected during calibration run */
26#define PMTMR_TICKS_PER_SEC 3579545
27
28/* 26/*
29 * The I/O port the PMTMR resides at. 27 * The I/O port the PMTMR resides at.
30 * The location is detected during setup_arch(), 28 * The location is detected during setup_arch(),
@@ -32,15 +30,13 @@
32 */ 30 */
33u32 pmtmr_ioport __read_mostly; 31u32 pmtmr_ioport __read_mostly;
34 32
35#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */
36
37static inline u32 read_pmtmr(void) 33static inline u32 read_pmtmr(void)
38{ 34{
39 /* mask the output to 24 bits */ 35 /* mask the output to 24 bits */
40 return inl(pmtmr_ioport) & ACPI_PM_MASK; 36 return inl(pmtmr_ioport) & ACPI_PM_MASK;
41} 37}
42 38
43static cycle_t acpi_pm_read_verified(void) 39u32 acpi_pm_read_verified(void)
44{ 40{
45 u32 v1 = 0, v2 = 0, v3 = 0; 41 u32 v1 = 0, v2 = 0, v3 = 0;
46 42
@@ -57,7 +53,12 @@ static cycle_t acpi_pm_read_verified(void)
57 } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) 53 } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
58 || (v3 > v1 && v3 < v2))); 54 || (v3 > v1 && v3 < v2)));
59 55
60 return (cycle_t)v2; 56 return v2;
57}
58
59static cycle_t acpi_pm_read_slow(void)
60{
61 return (cycle_t)acpi_pm_read_verified();
61} 62}
62 63
63static cycle_t acpi_pm_read(void) 64static cycle_t acpi_pm_read(void)
@@ -72,7 +73,8 @@ static struct clocksource clocksource_acpi_pm = {
72 .mask = (cycle_t)ACPI_PM_MASK, 73 .mask = (cycle_t)ACPI_PM_MASK,
73 .mult = 0, /*to be caluclated*/ 74 .mult = 0, /*to be caluclated*/
74 .shift = 22, 75 .shift = 22,
75 .is_continuous = 1, 76 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
77
76}; 78};
77 79
78 80
@@ -87,7 +89,7 @@ __setup("acpi_pm_good", acpi_pm_good_setup);
87 89
88static inline void acpi_pm_need_workaround(void) 90static inline void acpi_pm_need_workaround(void)
89{ 91{
90 clocksource_acpi_pm.read = acpi_pm_read_verified; 92 clocksource_acpi_pm.read = acpi_pm_read_slow;
91 clocksource_acpi_pm.rating = 110; 93 clocksource_acpi_pm.rating = 110;
92} 94}
93 95
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index bf4d3d50d1c4..4f3925ceb360 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -31,7 +31,7 @@ static struct clocksource clocksource_cyclone = {
31 .mask = CYCLONE_TIMER_MASK, 31 .mask = CYCLONE_TIMER_MASK,
32 .mult = 10, 32 .mult = 10,
33 .shift = 0, 33 .shift = 0,
34 .is_continuous = 1, 34 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
35}; 35};
36 36
37static int __init init_cyclone_clocksource(void) 37static int __init init_cyclone_clocksource(void)
diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c
index 22915cc46ba7..b92da677aa5d 100644
--- a/drivers/clocksource/scx200_hrt.c
+++ b/drivers/clocksource/scx200_hrt.c
@@ -57,7 +57,7 @@ static struct clocksource cs_hrt = {
57 .rating = 250, 57 .rating = 250,
58 .read = read_hrt, 58 .read = read_hrt,
59 .mask = CLOCKSOURCE_MASK(32), 59 .mask = CLOCKSOURCE_MASK(32),
60 .is_continuous = 1, 60 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
61 /* mult, shift are set based on mhz27 flag */ 61 /* mult, shift are set based on mhz27 flag */
62}; 62};
63 63
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 491779af8d55..d155e81b5c97 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -16,7 +16,7 @@ config CPU_FREQ
16if CPU_FREQ 16if CPU_FREQ
17 17
18config CPU_FREQ_TABLE 18config CPU_FREQ_TABLE
19 def_tristate m 19 tristate
20 20
21config CPU_FREQ_DEBUG 21config CPU_FREQ_DEBUG
22 bool "Enable CPUfreq debugging" 22 bool "Enable CPUfreq debugging"
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index a45cc89e387a..f52facc570f5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -41,8 +41,67 @@ static struct cpufreq_driver *cpufreq_driver;
41static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS]; 41static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS];
42static DEFINE_SPINLOCK(cpufreq_driver_lock); 42static DEFINE_SPINLOCK(cpufreq_driver_lock);
43 43
44/*
45 * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
46 * all cpufreq/hotplug/workqueue/etc related lock issues.
47 *
48 * The rules for this semaphore:
49 * - Any routine that wants to read from the policy structure will
50 * do a down_read on this semaphore.
51 * - Any routine that will write to the policy structure and/or may take away
52 * the policy altogether (eg. CPU hotplug), will hold this lock in write
53 * mode before doing so.
54 *
55 * Additional rules:
56 * - All holders of the lock should check to make sure that the CPU they
57 * are concerned with are online after they get the lock.
58 * - Governor routines that can be called in cpufreq hotplug path should not
59 * take this sem as top level hotplug notifier handler takes this.
60 */
61static DEFINE_PER_CPU(int, policy_cpu);
62static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem);
63
64#define lock_policy_rwsem(mode, cpu) \
65int lock_policy_rwsem_##mode \
66(int cpu) \
67{ \
68 int policy_cpu = per_cpu(policy_cpu, cpu); \
69 BUG_ON(policy_cpu == -1); \
70 down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \
71 if (unlikely(!cpu_online(cpu))) { \
72 up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \
73 return -1; \
74 } \
75 \
76 return 0; \
77}
78
79lock_policy_rwsem(read, cpu);
80EXPORT_SYMBOL_GPL(lock_policy_rwsem_read);
81
82lock_policy_rwsem(write, cpu);
83EXPORT_SYMBOL_GPL(lock_policy_rwsem_write);
84
85void unlock_policy_rwsem_read(int cpu)
86{
87 int policy_cpu = per_cpu(policy_cpu, cpu);
88 BUG_ON(policy_cpu == -1);
89 up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
90}
91EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read);
92
93void unlock_policy_rwsem_write(int cpu)
94{
95 int policy_cpu = per_cpu(policy_cpu, cpu);
96 BUG_ON(policy_cpu == -1);
97 up_write(&per_cpu(cpu_policy_rwsem, policy_cpu));
98}
99EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write);
100
101
44/* internal prototypes */ 102/* internal prototypes */
45static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); 103static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
104static unsigned int __cpufreq_get(unsigned int cpu);
46static void handle_update(struct work_struct *work); 105static void handle_update(struct work_struct *work);
47 106
48/** 107/**
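The comment block above spells out the rules for the new per-CPU policy rwsem; the shape every reader is expected to follow (close to cpufreq_quick_get() further down in this file) looks like this, with the surrounding function purely illustrative:

    /* Illustrative reader, as if written inside cpufreq.c: take a reference
     * on the policy, then the per-CPU read lock. lock_policy_rwsem_read()
     * returns non-zero if the CPU went offline, and in that case it has
     * already dropped the lock, so the caller only releases the refcount. */
    static unsigned int example_read_cur(unsigned int cpu)
    {
            struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
            unsigned int cur = 0;

            if (!policy)
                    return 0;

            if (unlikely(lock_policy_rwsem_read(cpu)))
                    goto out;               /* CPU offline, nothing to read */

            cur = policy->cur;              /* policy fields are stable here */
            unlock_policy_rwsem_read(cpu);
    out:
            cpufreq_cpu_put(policy);
            return cur;
    }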
@@ -415,12 +474,8 @@ static ssize_t store_##file_name \
415 if (ret != 1) \ 474 if (ret != 1) \
416 return -EINVAL; \ 475 return -EINVAL; \
417 \ 476 \
418 lock_cpu_hotplug(); \
419 mutex_lock(&policy->lock); \
420 ret = __cpufreq_set_policy(policy, &new_policy); \ 477 ret = __cpufreq_set_policy(policy, &new_policy); \
421 policy->user_policy.object = policy->object; \ 478 policy->user_policy.object = policy->object; \
422 mutex_unlock(&policy->lock); \
423 unlock_cpu_hotplug(); \
424 \ 479 \
425 return ret ? ret : count; \ 480 return ret ? ret : count; \
426} 481}
@@ -434,7 +489,7 @@ store_one(scaling_max_freq,max);
434static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, 489static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy,
435 char *buf) 490 char *buf)
436{ 491{
437 unsigned int cur_freq = cpufreq_get(policy->cpu); 492 unsigned int cur_freq = __cpufreq_get(policy->cpu);
438 if (!cur_freq) 493 if (!cur_freq)
439 return sprintf(buf, "<unknown>"); 494 return sprintf(buf, "<unknown>");
440 return sprintf(buf, "%u\n", cur_freq); 495 return sprintf(buf, "%u\n", cur_freq);
@@ -479,18 +534,12 @@ static ssize_t store_scaling_governor (struct cpufreq_policy * policy,
479 &new_policy.governor)) 534 &new_policy.governor))
480 return -EINVAL; 535 return -EINVAL;
481 536
482 lock_cpu_hotplug();
483
484 /* Do not use cpufreq_set_policy here or the user_policy.max 537 /* Do not use cpufreq_set_policy here or the user_policy.max
485 will be wrongly overridden */ 538 will be wrongly overridden */
486 mutex_lock(&policy->lock);
487 ret = __cpufreq_set_policy(policy, &new_policy); 539 ret = __cpufreq_set_policy(policy, &new_policy);
488 540
489 policy->user_policy.policy = policy->policy; 541 policy->user_policy.policy = policy->policy;
490 policy->user_policy.governor = policy->governor; 542 policy->user_policy.governor = policy->governor;
491 mutex_unlock(&policy->lock);
492
493 unlock_cpu_hotplug();
494 543
495 if (ret) 544 if (ret)
496 return ret; 545 return ret;
@@ -595,11 +644,17 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf)
595 policy = cpufreq_cpu_get(policy->cpu); 644 policy = cpufreq_cpu_get(policy->cpu);
596 if (!policy) 645 if (!policy)
597 return -EINVAL; 646 return -EINVAL;
647
648 if (lock_policy_rwsem_read(policy->cpu) < 0)
649 return -EINVAL;
650
598 if (fattr->show) 651 if (fattr->show)
599 ret = fattr->show(policy, buf); 652 ret = fattr->show(policy, buf);
600 else 653 else
601 ret = -EIO; 654 ret = -EIO;
602 655
656 unlock_policy_rwsem_read(policy->cpu);
657
603 cpufreq_cpu_put(policy); 658 cpufreq_cpu_put(policy);
604 return ret; 659 return ret;
605} 660}
@@ -613,11 +668,17 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr,
613 policy = cpufreq_cpu_get(policy->cpu); 668 policy = cpufreq_cpu_get(policy->cpu);
614 if (!policy) 669 if (!policy)
615 return -EINVAL; 670 return -EINVAL;
671
672 if (lock_policy_rwsem_write(policy->cpu) < 0)
673 return -EINVAL;
674
616 if (fattr->store) 675 if (fattr->store)
617 ret = fattr->store(policy, buf, count); 676 ret = fattr->store(policy, buf, count);
618 else 677 else
619 ret = -EIO; 678 ret = -EIO;
620 679
680 unlock_policy_rwsem_write(policy->cpu);
681
621 cpufreq_cpu_put(policy); 682 cpufreq_cpu_put(policy);
622 return ret; 683 return ret;
623} 684}
@@ -691,8 +752,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
691 policy->cpu = cpu; 752 policy->cpu = cpu;
692 policy->cpus = cpumask_of_cpu(cpu); 753 policy->cpus = cpumask_of_cpu(cpu);
693 754
694 mutex_init(&policy->lock); 755 /* Initially set CPU itself as the policy_cpu */
695 mutex_lock(&policy->lock); 756 per_cpu(policy_cpu, cpu) = cpu;
757 lock_policy_rwsem_write(cpu);
758
696 init_completion(&policy->kobj_unregister); 759 init_completion(&policy->kobj_unregister);
697 INIT_WORK(&policy->update, handle_update); 760 INIT_WORK(&policy->update, handle_update);
698 761
@@ -702,7 +765,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
702 ret = cpufreq_driver->init(policy); 765 ret = cpufreq_driver->init(policy);
703 if (ret) { 766 if (ret) {
704 dprintk("initialization failed\n"); 767 dprintk("initialization failed\n");
705 mutex_unlock(&policy->lock); 768 unlock_policy_rwsem_write(cpu);
706 goto err_out; 769 goto err_out;
707 } 770 }
708 771
@@ -716,6 +779,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
716 */ 779 */
717 managed_policy = cpufreq_cpu_get(j); 780 managed_policy = cpufreq_cpu_get(j);
718 if (unlikely(managed_policy)) { 781 if (unlikely(managed_policy)) {
782
783 /* Set proper policy_cpu */
784 unlock_policy_rwsem_write(cpu);
785 per_cpu(policy_cpu, cpu) = managed_policy->cpu;
786
787 if (lock_policy_rwsem_write(cpu) < 0)
788 goto err_out_driver_exit;
789
719 spin_lock_irqsave(&cpufreq_driver_lock, flags); 790 spin_lock_irqsave(&cpufreq_driver_lock, flags);
720 managed_policy->cpus = policy->cpus; 791 managed_policy->cpus = policy->cpus;
721 cpufreq_cpu_data[cpu] = managed_policy; 792 cpufreq_cpu_data[cpu] = managed_policy;
@@ -726,13 +797,13 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
726 &managed_policy->kobj, 797 &managed_policy->kobj,
727 "cpufreq"); 798 "cpufreq");
728 if (ret) { 799 if (ret) {
729 mutex_unlock(&policy->lock); 800 unlock_policy_rwsem_write(cpu);
730 goto err_out_driver_exit; 801 goto err_out_driver_exit;
731 } 802 }
732 803
733 cpufreq_debug_enable_ratelimit(); 804 cpufreq_debug_enable_ratelimit();
734 mutex_unlock(&policy->lock);
735 ret = 0; 805 ret = 0;
806 unlock_policy_rwsem_write(cpu);
736 goto err_out_driver_exit; /* call driver->exit() */ 807 goto err_out_driver_exit; /* call driver->exit() */
737 } 808 }
738 } 809 }
@@ -746,7 +817,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
746 817
747 ret = kobject_register(&policy->kobj); 818 ret = kobject_register(&policy->kobj);
748 if (ret) { 819 if (ret) {
749 mutex_unlock(&policy->lock); 820 unlock_policy_rwsem_write(cpu);
750 goto err_out_driver_exit; 821 goto err_out_driver_exit;
751 } 822 }
752 /* set up files for this cpu device */ 823 /* set up files for this cpu device */
@@ -761,8 +832,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
761 sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); 832 sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
762 833
763 spin_lock_irqsave(&cpufreq_driver_lock, flags); 834 spin_lock_irqsave(&cpufreq_driver_lock, flags);
764 for_each_cpu_mask(j, policy->cpus) 835 for_each_cpu_mask(j, policy->cpus) {
765 cpufreq_cpu_data[j] = policy; 836 cpufreq_cpu_data[j] = policy;
837 per_cpu(policy_cpu, j) = policy->cpu;
838 }
766 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 839 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
767 840
768 /* symlink affected CPUs */ 841 /* symlink affected CPUs */
@@ -778,14 +851,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
778 ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, 851 ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
779 "cpufreq"); 852 "cpufreq");
780 if (ret) { 853 if (ret) {
781 mutex_unlock(&policy->lock); 854 unlock_policy_rwsem_write(cpu);
782 goto err_out_unregister; 855 goto err_out_unregister;
783 } 856 }
784 } 857 }
785 858
786 policy->governor = NULL; /* to assure that the starting sequence is 859 policy->governor = NULL; /* to assure that the starting sequence is
787 * run in cpufreq_set_policy */ 860 * run in cpufreq_set_policy */
788 mutex_unlock(&policy->lock); 861 unlock_policy_rwsem_write(cpu);
789 862
790 /* set default policy */ 863 /* set default policy */
791 ret = cpufreq_set_policy(&new_policy); 864 ret = cpufreq_set_policy(&new_policy);
@@ -826,11 +899,13 @@ module_out:
826 899
827 900
828/** 901/**
829 * cpufreq_remove_dev - remove a CPU device 902 * __cpufreq_remove_dev - remove a CPU device
830 * 903 *
831 * Removes the cpufreq interface for a CPU device. 904 * Removes the cpufreq interface for a CPU device.
905 * Caller should already have policy_rwsem in write mode for this CPU.
906 * This routine frees the rwsem before returning.
832 */ 907 */
833static int cpufreq_remove_dev (struct sys_device * sys_dev) 908static int __cpufreq_remove_dev (struct sys_device * sys_dev)
834{ 909{
835 unsigned int cpu = sys_dev->id; 910 unsigned int cpu = sys_dev->id;
836 unsigned long flags; 911 unsigned long flags;
@@ -849,6 +924,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
849 if (!data) { 924 if (!data) {
850 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 925 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
851 cpufreq_debug_enable_ratelimit(); 926 cpufreq_debug_enable_ratelimit();
927 unlock_policy_rwsem_write(cpu);
852 return -EINVAL; 928 return -EINVAL;
853 } 929 }
854 cpufreq_cpu_data[cpu] = NULL; 930 cpufreq_cpu_data[cpu] = NULL;
@@ -865,6 +941,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
865 sysfs_remove_link(&sys_dev->kobj, "cpufreq"); 941 sysfs_remove_link(&sys_dev->kobj, "cpufreq");
866 cpufreq_cpu_put(data); 942 cpufreq_cpu_put(data);
867 cpufreq_debug_enable_ratelimit(); 943 cpufreq_debug_enable_ratelimit();
944 unlock_policy_rwsem_write(cpu);
868 return 0; 945 return 0;
869 } 946 }
870#endif 947#endif
@@ -873,6 +950,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
873 if (!kobject_get(&data->kobj)) { 950 if (!kobject_get(&data->kobj)) {
874 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 951 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
875 cpufreq_debug_enable_ratelimit(); 952 cpufreq_debug_enable_ratelimit();
953 unlock_policy_rwsem_write(cpu);
876 return -EFAULT; 954 return -EFAULT;
877 } 955 }
878 956
@@ -906,10 +984,10 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
906 spin_unlock_irqrestore(&cpufreq_driver_lock, flags); 984 spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
907#endif 985#endif
908 986
909 mutex_lock(&data->lock);
910 if (cpufreq_driver->target) 987 if (cpufreq_driver->target)
911 __cpufreq_governor(data, CPUFREQ_GOV_STOP); 988 __cpufreq_governor(data, CPUFREQ_GOV_STOP);
912 mutex_unlock(&data->lock); 989
990 unlock_policy_rwsem_write(cpu);
913 991
914 kobject_unregister(&data->kobj); 992 kobject_unregister(&data->kobj);
915 993
@@ -933,6 +1011,18 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
933} 1011}
934 1012
935 1013
1014static int cpufreq_remove_dev (struct sys_device * sys_dev)
1015{
1016 unsigned int cpu = sys_dev->id;
1017 int retval;
1018 if (unlikely(lock_policy_rwsem_write(cpu)))
1019 BUG();
1020
1021 retval = __cpufreq_remove_dev(sys_dev);
1022 return retval;
1023}
1024
1025
936static void handle_update(struct work_struct *work) 1026static void handle_update(struct work_struct *work)
937{ 1027{
938 struct cpufreq_policy *policy = 1028 struct cpufreq_policy *policy =
@@ -980,9 +1070,12 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
980 unsigned int ret_freq = 0; 1070 unsigned int ret_freq = 0;
981 1071
982 if (policy) { 1072 if (policy) {
983 mutex_lock(&policy->lock); 1073 if (unlikely(lock_policy_rwsem_read(cpu)))
1074 return ret_freq;
1075
984 ret_freq = policy->cur; 1076 ret_freq = policy->cur;
985 mutex_unlock(&policy->lock); 1077
1078 unlock_policy_rwsem_read(cpu);
986 cpufreq_cpu_put(policy); 1079 cpufreq_cpu_put(policy);
987 } 1080 }
988 1081
@@ -991,24 +1084,13 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
991EXPORT_SYMBOL(cpufreq_quick_get); 1084EXPORT_SYMBOL(cpufreq_quick_get);
992 1085
993 1086
994/** 1087static unsigned int __cpufreq_get(unsigned int cpu)
995 * cpufreq_get - get the current CPU frequency (in kHz)
996 * @cpu: CPU number
997 *
998 * Get the CPU current (static) CPU frequency
999 */
1000unsigned int cpufreq_get(unsigned int cpu)
1001{ 1088{
1002 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); 1089 struct cpufreq_policy *policy = cpufreq_cpu_data[cpu];
1003 unsigned int ret_freq = 0; 1090 unsigned int ret_freq = 0;
1004 1091
1005 if (!policy)
1006 return 0;
1007
1008 if (!cpufreq_driver->get) 1092 if (!cpufreq_driver->get)
1009 goto out; 1093 return (ret_freq);
1010
1011 mutex_lock(&policy->lock);
1012 1094
1013 ret_freq = cpufreq_driver->get(cpu); 1095 ret_freq = cpufreq_driver->get(cpu);
1014 1096
@@ -1022,11 +1104,33 @@ unsigned int cpufreq_get(unsigned int cpu)
1022 } 1104 }
1023 } 1105 }
1024 1106
1025 mutex_unlock(&policy->lock); 1107 return (ret_freq);
1108}
1026 1109
1027out: 1110/**
1028 cpufreq_cpu_put(policy); 1111 * cpufreq_get - get the current CPU frequency (in kHz)
1112 * @cpu: CPU number
1113 *
1114 * Get the CPU current (static) CPU frequency
1115 */
1116unsigned int cpufreq_get(unsigned int cpu)
1117{
1118 unsigned int ret_freq = 0;
1119 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
1120
1121 if (!policy)
1122 goto out;
1123
1124 if (unlikely(lock_policy_rwsem_read(cpu)))
1125 goto out_policy;
1126
1127 ret_freq = __cpufreq_get(cpu);
1029 1128
1129 unlock_policy_rwsem_read(cpu);
1130
1131out_policy:
1132 cpufreq_cpu_put(policy);
1133out:
1030 return (ret_freq); 1134 return (ret_freq);
1031} 1135}
1032EXPORT_SYMBOL(cpufreq_get); 1136EXPORT_SYMBOL(cpufreq_get);
@@ -1278,7 +1382,6 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier);
1278 *********************************************************************/ 1382 *********************************************************************/
1279 1383
1280 1384
1281/* Must be called with lock_cpu_hotplug held */
1282int __cpufreq_driver_target(struct cpufreq_policy *policy, 1385int __cpufreq_driver_target(struct cpufreq_policy *policy,
1283 unsigned int target_freq, 1386 unsigned int target_freq,
1284 unsigned int relation) 1387 unsigned int relation)
@@ -1304,20 +1407,19 @@ int cpufreq_driver_target(struct cpufreq_policy *policy,
1304 if (!policy) 1407 if (!policy)
1305 return -EINVAL; 1408 return -EINVAL;
1306 1409
1307 lock_cpu_hotplug(); 1410 if (unlikely(lock_policy_rwsem_write(policy->cpu)))
1308 mutex_lock(&policy->lock); 1411 return -EINVAL;
1309 1412
1310 ret = __cpufreq_driver_target(policy, target_freq, relation); 1413 ret = __cpufreq_driver_target(policy, target_freq, relation);
1311 1414
1312 mutex_unlock(&policy->lock); 1415 unlock_policy_rwsem_write(policy->cpu);
1313 unlock_cpu_hotplug();
1314 1416
1315 cpufreq_cpu_put(policy); 1417 cpufreq_cpu_put(policy);
1316 return ret; 1418 return ret;
1317} 1419}
1318EXPORT_SYMBOL_GPL(cpufreq_driver_target); 1420EXPORT_SYMBOL_GPL(cpufreq_driver_target);
1319 1421
1320int cpufreq_driver_getavg(struct cpufreq_policy *policy) 1422int __cpufreq_driver_getavg(struct cpufreq_policy *policy)
1321{ 1423{
1322 int ret = 0; 1424 int ret = 0;
1323 1425
@@ -1325,20 +1427,15 @@ int cpufreq_driver_getavg(struct cpufreq_policy *policy)
1325 if (!policy) 1427 if (!policy)
1326 return -EINVAL; 1428 return -EINVAL;
1327 1429
1328 mutex_lock(&policy->lock);
1329
1330 if (cpu_online(policy->cpu) && cpufreq_driver->getavg) 1430 if (cpu_online(policy->cpu) && cpufreq_driver->getavg)
1331 ret = cpufreq_driver->getavg(policy->cpu); 1431 ret = cpufreq_driver->getavg(policy->cpu);
1332 1432
1333 mutex_unlock(&policy->lock);
1334
1335 cpufreq_cpu_put(policy); 1433 cpufreq_cpu_put(policy);
1336 return ret; 1434 return ret;
1337} 1435}
1338EXPORT_SYMBOL_GPL(cpufreq_driver_getavg); 1436EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg);
1339 1437
1340/* 1438/*
1341 * Locking: Must be called with the lock_cpu_hotplug() lock held
1342 * when "event" is CPUFREQ_GOV_LIMITS 1439 * when "event" is CPUFREQ_GOV_LIMITS
1343 */ 1440 */
1344 1441
@@ -1420,9 +1517,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
1420 if (!cpu_policy) 1517 if (!cpu_policy)
1421 return -EINVAL; 1518 return -EINVAL;
1422 1519
1423 mutex_lock(&cpu_policy->lock);
1424 memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy)); 1520 memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy));
1425 mutex_unlock(&cpu_policy->lock);
1426 1521
1427 cpufreq_cpu_put(cpu_policy); 1522 cpufreq_cpu_put(cpu_policy);
1428 return 0; 1523 return 0;
@@ -1433,7 +1528,6 @@ EXPORT_SYMBOL(cpufreq_get_policy);
1433/* 1528/*
1434 * data : current policy. 1529 * data : current policy.
1435 * policy : policy to be set. 1530 * policy : policy to be set.
1436 * Locking: Must be called with the lock_cpu_hotplug() lock held
1437 */ 1531 */
1438static int __cpufreq_set_policy(struct cpufreq_policy *data, 1532static int __cpufreq_set_policy(struct cpufreq_policy *data,
1439 struct cpufreq_policy *policy) 1533 struct cpufreq_policy *policy)
@@ -1539,10 +1633,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy)
1539 if (!data) 1633 if (!data)
1540 return -EINVAL; 1634 return -EINVAL;
1541 1635
1542 lock_cpu_hotplug(); 1636 if (unlikely(lock_policy_rwsem_write(policy->cpu)))
1637 return -EINVAL;
1543 1638
1544 /* lock this CPU */
1545 mutex_lock(&data->lock);
1546 1639
1547 ret = __cpufreq_set_policy(data, policy); 1640 ret = __cpufreq_set_policy(data, policy);
1548 data->user_policy.min = data->min; 1641 data->user_policy.min = data->min;
@@ -1550,9 +1643,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy)
1550 data->user_policy.policy = data->policy; 1643 data->user_policy.policy = data->policy;
1551 data->user_policy.governor = data->governor; 1644 data->user_policy.governor = data->governor;
1552 1645
1553 mutex_unlock(&data->lock); 1646 unlock_policy_rwsem_write(policy->cpu);
1554 1647
1555 unlock_cpu_hotplug();
1556 cpufreq_cpu_put(data); 1648 cpufreq_cpu_put(data);
1557 1649
1558 return ret; 1650 return ret;
@@ -1576,8 +1668,8 @@ int cpufreq_update_policy(unsigned int cpu)
1576 if (!data) 1668 if (!data)
1577 return -ENODEV; 1669 return -ENODEV;
1578 1670
1579 lock_cpu_hotplug(); 1671 if (unlikely(lock_policy_rwsem_write(cpu)))
1580 mutex_lock(&data->lock); 1672 return -EINVAL;
1581 1673
1582 dprintk("updating policy for CPU %u\n", cpu); 1674 dprintk("updating policy for CPU %u\n", cpu);
1583 memcpy(&policy, data, sizeof(struct cpufreq_policy)); 1675 memcpy(&policy, data, sizeof(struct cpufreq_policy));
@@ -1602,8 +1694,8 @@ int cpufreq_update_policy(unsigned int cpu)
1602 1694
1603 ret = __cpufreq_set_policy(data, &policy); 1695 ret = __cpufreq_set_policy(data, &policy);
1604 1696
1605 mutex_unlock(&data->lock); 1697 unlock_policy_rwsem_write(cpu);
1606 unlock_cpu_hotplug(); 1698
1607 cpufreq_cpu_put(data); 1699 cpufreq_cpu_put(data);
1608 return ret; 1700 return ret;
1609} 1701}
@@ -1613,31 +1705,28 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
1613 unsigned long action, void *hcpu) 1705 unsigned long action, void *hcpu)
1614{ 1706{
1615 unsigned int cpu = (unsigned long)hcpu; 1707 unsigned int cpu = (unsigned long)hcpu;
1616 struct cpufreq_policy *policy;
1617 struct sys_device *sys_dev; 1708 struct sys_device *sys_dev;
1709 struct cpufreq_policy *policy;
1618 1710
1619 sys_dev = get_cpu_sysdev(cpu); 1711 sys_dev = get_cpu_sysdev(cpu);
1620
1621 if (sys_dev) { 1712 if (sys_dev) {
1622 switch (action) { 1713 switch (action) {
1623 case CPU_ONLINE: 1714 case CPU_ONLINE:
1624 cpufreq_add_dev(sys_dev); 1715 cpufreq_add_dev(sys_dev);
1625 break; 1716 break;
1626 case CPU_DOWN_PREPARE: 1717 case CPU_DOWN_PREPARE:
1627 /* 1718 if (unlikely(lock_policy_rwsem_write(cpu)))
1628 * We attempt to put this cpu in lowest frequency 1719 BUG();
1629 * possible before going down. This will permit 1720
1630 * hardware-managed P-State to switch other related
1631 * threads to min or higher speeds if possible.
1632 */
1633 policy = cpufreq_cpu_data[cpu]; 1721 policy = cpufreq_cpu_data[cpu];
1634 if (policy) { 1722 if (policy) {
1635 cpufreq_driver_target(policy, policy->min, 1723 __cpufreq_driver_target(policy, policy->min,
1636 CPUFREQ_RELATION_H); 1724 CPUFREQ_RELATION_H);
1637 } 1725 }
1726 __cpufreq_remove_dev(sys_dev);
1638 break; 1727 break;
1639 case CPU_DEAD: 1728 case CPU_DOWN_FAILED:
1640 cpufreq_remove_dev(sys_dev); 1729 cpufreq_add_dev(sys_dev);
1641 break; 1730 break;
1642 } 1731 }
1643 } 1732 }
@@ -1751,3 +1840,16 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
1751 return 0; 1840 return 0;
1752} 1841}
1753EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); 1842EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
1843
1844static int __init cpufreq_core_init(void)
1845{
1846 int cpu;
1847
1848 for_each_possible_cpu(cpu) {
1849 per_cpu(policy_cpu, cpu) = -1;
1850 init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
1851 }
1852 return 0;
1853}
1854
1855core_initcall(cpufreq_core_init);
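Taken together, the cpufreq.c changes settle on one convention: the double-underscore variants (__cpufreq_get, __cpufreq_set_policy, __cpufreq_driver_target, __cpufreq_driver_getavg, __cpufreq_remove_dev) assume the caller already holds the per-CPU policy rwsem, while the un-prefixed wrappers acquire and release it around the call. Schematically, with hypothetical names:

    /* Schematic of the locked-wrapper / unlocked-worker pairs in this file
     * (example_op/__example_op are made up for illustration). */
    static int __example_op(struct cpufreq_policy *policy)
    {
            /* touches policy fields; policy rwsem held by the caller */
            return 0;
    }

    static int example_op(struct cpufreq_policy *policy)
    {
            int ret;

            if (unlikely(lock_policy_rwsem_write(policy->cpu)))
                    return -EINVAL;         /* CPU went away under us */

            ret = __example_op(policy);

            unlock_policy_rwsem_write(policy->cpu);
            return ret;
    }

The one exception is the removal path: cpufreq_remove_dev() takes the write lock, but __cpufreq_remove_dev() drops it itself before tearing down the kobject, as its updated kerneldoc notes.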
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 05d6c22ba07c..26f440ccc3fb 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -429,14 +429,12 @@ static void dbs_check_cpu(int cpu)
429static void do_dbs_timer(struct work_struct *work) 429static void do_dbs_timer(struct work_struct *work)
430{ 430{
431 int i; 431 int i;
432 lock_cpu_hotplug();
433 mutex_lock(&dbs_mutex); 432 mutex_lock(&dbs_mutex);
434 for_each_online_cpu(i) 433 for_each_online_cpu(i)
435 dbs_check_cpu(i); 434 dbs_check_cpu(i);
436 schedule_delayed_work(&dbs_work, 435 schedule_delayed_work(&dbs_work,
437 usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); 436 usecs_to_jiffies(dbs_tuners_ins.sampling_rate));
438 mutex_unlock(&dbs_mutex); 437 mutex_unlock(&dbs_mutex);
439 unlock_cpu_hotplug();
440} 438}
441 439
442static inline void dbs_timer_init(void) 440static inline void dbs_timer_init(void)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index f697449327c6..d60bcb9d14cc 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -52,19 +52,20 @@ static unsigned int def_sampling_rate;
52static void do_dbs_timer(struct work_struct *work); 52static void do_dbs_timer(struct work_struct *work);
53 53
54/* Sampling types */ 54/* Sampling types */
55enum dbs_sample {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; 55enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
56 56
57struct cpu_dbs_info_s { 57struct cpu_dbs_info_s {
58 cputime64_t prev_cpu_idle; 58 cputime64_t prev_cpu_idle;
59 cputime64_t prev_cpu_wall; 59 cputime64_t prev_cpu_wall;
60 struct cpufreq_policy *cur_policy; 60 struct cpufreq_policy *cur_policy;
61 struct delayed_work work; 61 struct delayed_work work;
62 enum dbs_sample sample_type;
63 unsigned int enable;
64 struct cpufreq_frequency_table *freq_table; 62 struct cpufreq_frequency_table *freq_table;
65 unsigned int freq_lo; 63 unsigned int freq_lo;
66 unsigned int freq_lo_jiffies; 64 unsigned int freq_lo_jiffies;
67 unsigned int freq_hi_jiffies; 65 unsigned int freq_hi_jiffies;
66 int cpu;
67 unsigned int enable:1,
68 sample_type:1;
68}; 69};
69static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); 70static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
70 71
@@ -402,7 +403,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
402 if (load < (dbs_tuners_ins.up_threshold - 10)) { 403 if (load < (dbs_tuners_ins.up_threshold - 10)) {
403 unsigned int freq_next, freq_cur; 404 unsigned int freq_next, freq_cur;
404 405
405 freq_cur = cpufreq_driver_getavg(policy); 406 freq_cur = __cpufreq_driver_getavg(policy);
406 if (!freq_cur) 407 if (!freq_cur)
407 freq_cur = policy->cur; 408 freq_cur = policy->cur;
408 409
@@ -423,9 +424,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
423 424
424static void do_dbs_timer(struct work_struct *work) 425static void do_dbs_timer(struct work_struct *work)
425{ 426{
426 unsigned int cpu = smp_processor_id(); 427 struct cpu_dbs_info_s *dbs_info =
427 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); 428 container_of(work, struct cpu_dbs_info_s, work.work);
428 enum dbs_sample sample_type = dbs_info->sample_type; 429 unsigned int cpu = dbs_info->cpu;
430 int sample_type = dbs_info->sample_type;
431
429 /* We want all CPUs to do sampling nearly on same jiffy */ 432 /* We want all CPUs to do sampling nearly on same jiffy */
430 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); 433 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
431 434
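do_dbs_timer() now recovers its per-CPU state from the work item itself, via container_of() and a stored cpu field, instead of trusting smp_processor_id(); the generic shape of that pattern, with hypothetical names, is:

    #include <linux/workqueue.h>

    /* Embed the delayed_work in the per-CPU state and record the owning CPU,
     * then recover the state from the work pointer inside the handler
     * (struct and field names are made up for illustration). */
    struct example_state {
            int cpu;                /* CPU this work item was queued for */
            struct delayed_work work;
    };

    static void example_worker(struct work_struct *work)
    {
            struct example_state *st =
                    container_of(work, struct example_state, work.work);

            /* st->cpu identifies the owning CPU even if the handler
             * happens to run elsewhere */
    }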
@@ -434,15 +437,19 @@ static void do_dbs_timer(struct work_struct *work)
434 437
435 delay -= jiffies % delay; 438 delay -= jiffies % delay;
436 439
437 if (!dbs_info->enable) 440 if (lock_policy_rwsem_write(cpu) < 0)
441 return;
442
443 if (!dbs_info->enable) {
444 unlock_policy_rwsem_write(cpu);
438 return; 445 return;
446 }
447
439 /* Common NORMAL_SAMPLE setup */ 448 /* Common NORMAL_SAMPLE setup */
440 dbs_info->sample_type = DBS_NORMAL_SAMPLE; 449 dbs_info->sample_type = DBS_NORMAL_SAMPLE;
441 if (!dbs_tuners_ins.powersave_bias || 450 if (!dbs_tuners_ins.powersave_bias ||
442 sample_type == DBS_NORMAL_SAMPLE) { 451 sample_type == DBS_NORMAL_SAMPLE) {
443 lock_cpu_hotplug();
444 dbs_check_cpu(dbs_info); 452 dbs_check_cpu(dbs_info);
445 unlock_cpu_hotplug();
446 if (dbs_info->freq_lo) { 453 if (dbs_info->freq_lo) {
447 /* Setup timer for SUB_SAMPLE */ 454 /* Setup timer for SUB_SAMPLE */
448 dbs_info->sample_type = DBS_SUB_SAMPLE; 455 dbs_info->sample_type = DBS_SUB_SAMPLE;
@@ -454,26 +461,27 @@ static void do_dbs_timer(struct work_struct *work)
454 CPUFREQ_RELATION_H); 461 CPUFREQ_RELATION_H);
455 } 462 }
456 queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); 463 queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
464 unlock_policy_rwsem_write(cpu);
457} 465}
458 466
459static inline void dbs_timer_init(unsigned int cpu) 467static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
460{ 468{
461 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
462 /* We want all CPUs to do sampling nearly on same jiffy */ 469 /* We want all CPUs to do sampling nearly on same jiffy */
463 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); 470 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
464 delay -= jiffies % delay; 471 delay -= jiffies % delay;
465 472
473 dbs_info->enable = 1;
466 ondemand_powersave_bias_init(); 474 ondemand_powersave_bias_init();
467 INIT_DELAYED_WORK_NAR(&dbs_info->work, do_dbs_timer);
468 dbs_info->sample_type = DBS_NORMAL_SAMPLE; 475 dbs_info->sample_type = DBS_NORMAL_SAMPLE;
469 queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); 476 INIT_DELAYED_WORK_NAR(&dbs_info->work, do_dbs_timer);
477 queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
478 delay);
470} 479}
471 480
472static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) 481static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
473{ 482{
474 dbs_info->enable = 0; 483 dbs_info->enable = 0;
475 cancel_delayed_work(&dbs_info->work); 484 cancel_delayed_work(&dbs_info->work);
476 flush_workqueue(kondemand_wq);
477} 485}
478 486
479static int cpufreq_governor_dbs(struct cpufreq_policy *policy, 487static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
@@ -502,21 +510,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
502 510
503 mutex_lock(&dbs_mutex); 511 mutex_lock(&dbs_mutex);
504 dbs_enable++; 512 dbs_enable++;
505 if (dbs_enable == 1) {
506 kondemand_wq = create_workqueue("kondemand");
507 if (!kondemand_wq) {
508 printk(KERN_ERR
509 "Creation of kondemand failed\n");
510 dbs_enable--;
511 mutex_unlock(&dbs_mutex);
512 return -ENOSPC;
513 }
514 }
515 513
516 rc = sysfs_create_group(&policy->kobj, &dbs_attr_group); 514 rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
517 if (rc) { 515 if (rc) {
518 if (dbs_enable == 1)
519 destroy_workqueue(kondemand_wq);
520 dbs_enable--; 516 dbs_enable--;
521 mutex_unlock(&dbs_mutex); 517 mutex_unlock(&dbs_mutex);
522 return rc; 518 return rc;
@@ -530,7 +526,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
530 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); 526 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j);
531 j_dbs_info->prev_cpu_wall = get_jiffies_64(); 527 j_dbs_info->prev_cpu_wall = get_jiffies_64();
532 } 528 }
533 this_dbs_info->enable = 1; 529 this_dbs_info->cpu = cpu;
534 /* 530 /*
535 * Start the timerschedule work, when this governor 531 * Start the timerschedule work, when this governor
536 * is used for first time 532 * is used for first time
@@ -550,7 +546,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
550 546
551 dbs_tuners_ins.sampling_rate = def_sampling_rate; 547 dbs_tuners_ins.sampling_rate = def_sampling_rate;
552 } 548 }
553 dbs_timer_init(policy->cpu); 549 dbs_timer_init(this_dbs_info);
554 550
555 mutex_unlock(&dbs_mutex); 551 mutex_unlock(&dbs_mutex);
556 break; 552 break;
@@ -560,9 +556,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
560 dbs_timer_exit(this_dbs_info); 556 dbs_timer_exit(this_dbs_info);
561 sysfs_remove_group(&policy->kobj, &dbs_attr_group); 557 sysfs_remove_group(&policy->kobj, &dbs_attr_group);
562 dbs_enable--; 558 dbs_enable--;
563 if (dbs_enable == 0)
564 destroy_workqueue(kondemand_wq);
565
566 mutex_unlock(&dbs_mutex); 559 mutex_unlock(&dbs_mutex);
567 560
568 break; 561 break;
@@ -591,12 +584,18 @@ static struct cpufreq_governor cpufreq_gov_dbs = {
591 584
592static int __init cpufreq_gov_dbs_init(void) 585static int __init cpufreq_gov_dbs_init(void)
593{ 586{
587 kondemand_wq = create_workqueue("kondemand");
588 if (!kondemand_wq) {
589 printk(KERN_ERR "Creation of kondemand failed\n");
590 return -EFAULT;
591 }
594 return cpufreq_register_governor(&cpufreq_gov_dbs); 592 return cpufreq_register_governor(&cpufreq_gov_dbs);
595} 593}
596 594
597static void __exit cpufreq_gov_dbs_exit(void) 595static void __exit cpufreq_gov_dbs_exit(void)
598{ 596{
599 cpufreq_unregister_governor(&cpufreq_gov_dbs); 597 cpufreq_unregister_governor(&cpufreq_gov_dbs);
598 destroy_workqueue(kondemand_wq);
600} 599}
601 600
602 601
@@ -608,3 +607,4 @@ MODULE_LICENSE("GPL");
608 607
609module_init(cpufreq_gov_dbs_init); 608module_init(cpufreq_gov_dbs_init);
610module_exit(cpufreq_gov_dbs_exit); 609module_exit(cpufreq_gov_dbs_exit);
610
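
Annotation: the ondemand hunks above make three related changes. do_dbs_timer() now finds its per-CPU state with container_of() instead of assuming it runs on smp_processor_id(), the global lock_cpu_hotplug() serialization is replaced by the cpufreq core's per-policy rwsem, and the kondemand workqueue is created once at module init rather than on first governor start. A minimal sketch of the resulting timer body, condensed from the patched code above (lock_policy_rwsem_write()/unlock_policy_rwsem_write() are presumably provided by the cpufreq.c change earlier in this patch; the powersave_bias sub-sample path is omitted):

	static void do_dbs_timer(struct work_struct *work)
	{
		struct cpu_dbs_info_s *dbs_info =
			container_of(work, struct cpu_dbs_info_s, work.work);
		unsigned int cpu = dbs_info->cpu;
		/* keep all CPUs sampling on (nearly) the same jiffy */
		int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

		delay -= jiffies % delay;

		if (lock_policy_rwsem_write(cpu) < 0)
			return;				/* policy is being torn down */
		if (!dbs_info->enable) {
			unlock_policy_rwsem_write(cpu);
			return;
		}
		dbs_check_cpu(dbs_info);		/* samples load, may change frequency */
		queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
		unlock_policy_rwsem_write(cpu);
	}
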
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 91ad342a6051..d1c7cac9316c 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -370,12 +370,10 @@ __exit cpufreq_stats_exit(void)
370 cpufreq_unregister_notifier(&notifier_trans_block, 370 cpufreq_unregister_notifier(&notifier_trans_block,
371 CPUFREQ_TRANSITION_NOTIFIER); 371 CPUFREQ_TRANSITION_NOTIFIER);
372 unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier); 372 unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
373 lock_cpu_hotplug();
374 for_each_online_cpu(cpu) { 373 for_each_online_cpu(cpu) {
375 cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, 374 cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier,
376 CPU_DEAD, (void *)(long)cpu); 375 CPU_DEAD, (void *)(long)cpu);
377 } 376 }
378 unlock_cpu_hotplug();
379} 377}
380 378
381MODULE_AUTHOR ("Zou Nan hai <nanhai.zou@intel.com>"); 379MODULE_AUTHOR ("Zou Nan hai <nanhai.zou@intel.com>");
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index 2a4eb0bfaf30..860345c7799a 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -71,7 +71,6 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy)
71 71
72 dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); 72 dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq);
73 73
74 lock_cpu_hotplug();
75 mutex_lock(&userspace_mutex); 74 mutex_lock(&userspace_mutex);
76 if (!cpu_is_managed[policy->cpu]) 75 if (!cpu_is_managed[policy->cpu])
77 goto err; 76 goto err;
@@ -94,7 +93,6 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy)
94 93
95 err: 94 err:
96 mutex_unlock(&userspace_mutex); 95 mutex_unlock(&userspace_mutex);
97 unlock_cpu_hotplug();
98 return ret; 96 return ret;
99} 97}
100 98
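
Annotation: the cpufreq_stats and cpufreq_userspace hunks drop their lock_cpu_hotplug()/unlock_cpu_hotplug() pairs as well, presumably because the core now serializes governor callbacks and ->target() calls under the per-policy rwsem, leaving only governor-local state to protect. A rough sketch of the resulting userspace set path, with the clamping and target call elided:

	static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy)
	{
		int ret = -EINVAL;

		mutex_lock(&userspace_mutex);		/* protects governor-local state only */
		if (!cpu_is_managed[policy->cpu])
			goto err;
		/* ... clamp freq to policy->min/policy->max, record it, and call
		 *     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L) ... */
		ret = 0;
	err:
		mutex_unlock(&userspace_mutex);
		return ret;				/* hotplug exclusion is the caller's job now */
	}
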
diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index cd251efda410..0a26e0663542 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -546,7 +546,7 @@ static void ads7846_rx(void *ads)
546 ts->spi->dev.bus_id, ts->tc.ignore, Rt); 546 ts->spi->dev.bus_id, ts->tc.ignore, Rt);
547#endif 547#endif
548 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD), 548 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD),
549 HRTIMER_REL); 549 HRTIMER_MODE_REL);
550 return; 550 return;
551 } 551 }
552 552
@@ -578,7 +578,8 @@ static void ads7846_rx(void *ads)
578#endif 578#endif
579 } 579 }
580 580
581 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD), HRTIMER_REL); 581 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD),
582 HRTIMER_MODE_REL);
582} 583}
583 584
584static int ads7846_debounce(void *ads, int data_idx, int *val) 585static int ads7846_debounce(void *ads, int data_idx, int *val)
@@ -667,7 +668,7 @@ static void ads7846_rx_val(void *ads)
667 status); 668 status);
668} 669}
669 670
670static int ads7846_timer(struct hrtimer *handle) 671static enum hrtimer_restart ads7846_timer(struct hrtimer *handle)
671{ 672{
672 struct ads7846 *ts = container_of(handle, struct ads7846, timer); 673 struct ads7846 *ts = container_of(handle, struct ads7846, timer);
673 int status = 0; 674 int status = 0;
@@ -724,7 +725,7 @@ static irqreturn_t ads7846_irq(int irq, void *handle)
724 disable_irq(ts->spi->irq); 725 disable_irq(ts->spi->irq);
725 ts->pending = 1; 726 ts->pending = 1;
726 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_DELAY), 727 hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_DELAY),
727 HRTIMER_REL); 728 HRTIMER_MODE_REL);
728 } 729 }
729 } 730 }
730 spin_unlock_irqrestore(&ts->lock, flags); 731 spin_unlock_irqrestore(&ts->lock, flags);
@@ -862,7 +863,7 @@ static int __devinit ads7846_probe(struct spi_device *spi)
862 ts->spi = spi; 863 ts->spi = spi;
863 ts->input = input_dev; 864 ts->input = input_dev;
864 865
865 hrtimer_init(&ts->timer, CLOCK_MONOTONIC, HRTIMER_REL); 866 hrtimer_init(&ts->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
866 ts->timer.function = ads7846_timer; 867 ts->timer.function = ads7846_timer;
867 868
868 spin_lock_init(&ts->lock); 869 spin_lock_init(&ts->lock);
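
Annotation: the ads7846 hunks track two hrtimer interface renames made elsewhere in this patch: the relative-arming flag becomes an hrtimer mode constant (HRTIMER_MODE_REL) and timer callbacks return enum hrtimer_restart instead of int. A minimal sketch of the updated pattern (the example_* names are illustrative, not from the driver):

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static enum hrtimer_restart example_timer(struct hrtimer *handle)
	{
		/* ... kick off the next poll, as ads7846_timer() does ... */
		return HRTIMER_NORESTART;	/* re-armed explicitly by the rx path */
	}

	static void example_arm(struct hrtimer *timer, unsigned long poll_ns)
	{
		hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		timer->function = example_timer;
		hrtimer_start(timer, ktime_set(0, poll_ns), HRTIMER_MODE_REL);
	}
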
diff --git a/drivers/isdn/gigaset/Makefile b/drivers/isdn/gigaset/Makefile
index 835b806a9de7..077e297d8c72 100644
--- a/drivers/isdn/gigaset/Makefile
+++ b/drivers/isdn/gigaset/Makefile
@@ -5,4 +5,4 @@ ser_gigaset-y := ser-gigaset.o asyncdata.o
5 5
6obj-$(CONFIG_GIGASET_M105) += usb_gigaset.o gigaset.o 6obj-$(CONFIG_GIGASET_M105) += usb_gigaset.o gigaset.o
7obj-$(CONFIG_GIGASET_BASE) += bas_gigaset.o gigaset.o 7obj-$(CONFIG_GIGASET_BASE) += bas_gigaset.o gigaset.o
8obj-$(CONFIG_GIGASET_M105) += ser_gigaset.o gigaset.o 8obj-$(CONFIG_GIGASET_M101) += ser_gigaset.o gigaset.o
diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c
index ccef56d0c157..ed3426062a8b 100644
--- a/drivers/video/s3c2410fb.c
+++ b/drivers/video/s3c2410fb.c
@@ -791,6 +791,8 @@ static int __init s3c2410fb_probe(struct platform_device *pdev)
791 791
792 info = fbinfo->par; 792 info = fbinfo->par;
793 info->fb = fbinfo; 793 info->fb = fbinfo;
794 info->dev = &pdev->dev;
795
794 platform_set_drvdata(pdev, fbinfo); 796 platform_set_drvdata(pdev, fbinfo);
795 797
796 dprintk("devinit\n"); 798 dprintk("devinit\n");
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b3609b7cdf11..403e3bad1455 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -467,6 +467,7 @@ extern struct kmem_cache *ecryptfs_header_cache_1;
467extern struct kmem_cache *ecryptfs_header_cache_2; 467extern struct kmem_cache *ecryptfs_header_cache_2;
468extern struct kmem_cache *ecryptfs_xattr_cache; 468extern struct kmem_cache *ecryptfs_xattr_cache;
469extern struct kmem_cache *ecryptfs_lower_page_cache; 469extern struct kmem_cache *ecryptfs_lower_page_cache;
470extern struct kmem_cache *ecryptfs_key_record_cache;
470 471
471int ecryptfs_interpose(struct dentry *hidden_dentry, 472int ecryptfs_interpose(struct dentry *hidden_dentry,
472 struct dentry *this_dentry, struct super_block *sb, 473 struct dentry *this_dentry, struct super_block *sb,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 81156e95ef8e..b550dea8eee6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1638,6 +1638,8 @@ out:
1638 return rc; 1638 return rc;
1639} 1639}
1640 1640
1641struct kmem_cache *ecryptfs_key_record_cache;
1642
1641/** 1643/**
1642 * ecryptfs_generate_key_packet_set 1644 * ecryptfs_generate_key_packet_set
1643 * @dest: Virtual address from which to write the key record set 1645 * @dest: Virtual address from which to write the key record set
@@ -1664,50 +1666,55 @@ ecryptfs_generate_key_packet_set(char *dest_base,
1664 &ecryptfs_superblock_to_private( 1666 &ecryptfs_superblock_to_private(
1665 ecryptfs_dentry->d_sb)->mount_crypt_stat; 1667 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1666 size_t written; 1668 size_t written;
1667 struct ecryptfs_key_record key_rec; 1669 struct ecryptfs_key_record *key_rec;
1668 int rc = 0; 1670 int rc = 0;
1669 1671
1670 (*len) = 0; 1672 (*len) = 0;
1673 key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL);
1674 if (!key_rec) {
1675 rc = -ENOMEM;
1676 goto out;
1677 }
1671 if (mount_crypt_stat->global_auth_tok) { 1678 if (mount_crypt_stat->global_auth_tok) {
1672 auth_tok = mount_crypt_stat->global_auth_tok; 1679 auth_tok = mount_crypt_stat->global_auth_tok;
1673 if (auth_tok->token_type == ECRYPTFS_PASSWORD) { 1680 if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
1674 rc = write_tag_3_packet((dest_base + (*len)), 1681 rc = write_tag_3_packet((dest_base + (*len)),
1675 max, auth_tok, 1682 max, auth_tok,
1676 crypt_stat, &key_rec, 1683 crypt_stat, key_rec,
1677 &written); 1684 &written);
1678 if (rc) { 1685 if (rc) {
1679 ecryptfs_printk(KERN_WARNING, "Error " 1686 ecryptfs_printk(KERN_WARNING, "Error "
1680 "writing tag 3 packet\n"); 1687 "writing tag 3 packet\n");
1681 goto out; 1688 goto out_free;
1682 } 1689 }
1683 (*len) += written; 1690 (*len) += written;
1684 /* Write auth tok signature packet */ 1691 /* Write auth tok signature packet */
1685 rc = write_tag_11_packet( 1692 rc = write_tag_11_packet(
1686 (dest_base + (*len)), 1693 (dest_base + (*len)),
1687 (max - (*len)), 1694 (max - (*len)),
1688 key_rec.sig, ECRYPTFS_SIG_SIZE, &written); 1695 key_rec->sig, ECRYPTFS_SIG_SIZE, &written);
1689 if (rc) { 1696 if (rc) {
1690 ecryptfs_printk(KERN_ERR, "Error writing " 1697 ecryptfs_printk(KERN_ERR, "Error writing "
1691 "auth tok signature packet\n"); 1698 "auth tok signature packet\n");
1692 goto out; 1699 goto out_free;
1693 } 1700 }
1694 (*len) += written; 1701 (*len) += written;
1695 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { 1702 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) {
1696 rc = write_tag_1_packet(dest_base + (*len), 1703 rc = write_tag_1_packet(dest_base + (*len),
1697 max, auth_tok, 1704 max, auth_tok,
1698 crypt_stat,mount_crypt_stat, 1705 crypt_stat,mount_crypt_stat,
1699 &key_rec, &written); 1706 key_rec, &written);
1700 if (rc) { 1707 if (rc) {
1701 ecryptfs_printk(KERN_WARNING, "Error " 1708 ecryptfs_printk(KERN_WARNING, "Error "
1702 "writing tag 1 packet\n"); 1709 "writing tag 1 packet\n");
1703 goto out; 1710 goto out_free;
1704 } 1711 }
1705 (*len) += written; 1712 (*len) += written;
1706 } else { 1713 } else {
1707 ecryptfs_printk(KERN_WARNING, "Unsupported " 1714 ecryptfs_printk(KERN_WARNING, "Unsupported "
1708 "authentication token type\n"); 1715 "authentication token type\n");
1709 rc = -EINVAL; 1716 rc = -EINVAL;
1710 goto out; 1717 goto out_free;
1711 } 1718 }
1712 } else 1719 } else
1713 BUG(); 1720 BUG();
@@ -1717,6 +1724,9 @@ ecryptfs_generate_key_packet_set(char *dest_base,
1717 ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); 1724 ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
1718 rc = -EIO; 1725 rc = -EIO;
1719 } 1726 }
1727
1728out_free:
1729 kmem_cache_free(ecryptfs_key_record_cache, key_rec);
1720out: 1730out:
1721 if (rc) 1731 if (rc)
1722 (*len) = 0; 1732 (*len) = 0;
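
Annotation: the keystore.c change allocates struct ecryptfs_key_record from a dedicated slab cache instead of placing it on the stack, presumably to keep a large structure out of the kernel stack; main.c registers the cache below. A small sketch of the allocate/use/free pattern this introduces (example_write_packets() is an illustrative name):

	#include <linux/slab.h>

	struct ecryptfs_key_record;				/* defined in ecryptfs_kernel.h */
	extern struct kmem_cache *ecryptfs_key_record_cache;	/* created at module init */

	static int example_write_packets(void)
	{
		struct ecryptfs_key_record *key_rec;
		int rc = 0;

		key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL);
		if (!key_rec)
			return -ENOMEM;
		/* ... write_tag_3_packet()/write_tag_1_packet() fill key_rec ... */
		kmem_cache_free(ecryptfs_key_record_cache, key_rec);
		return rc;
	}
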
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 26fe405a5763..80044d196fe0 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -651,6 +651,11 @@ static struct ecryptfs_cache_info {
651 .name = "ecryptfs_lower_page_cache", 651 .name = "ecryptfs_lower_page_cache",
652 .size = PAGE_CACHE_SIZE, 652 .size = PAGE_CACHE_SIZE,
653 }, 653 },
654 {
655 .cache = &ecryptfs_key_record_cache,
656 .name = "ecryptfs_key_record_cache",
657 .size = sizeof(struct ecryptfs_key_record),
658 },
654}; 659};
655 660
656static void ecryptfs_free_kmem_caches(void) 661static void ecryptfs_free_kmem_caches(void)
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 47d7e7b611f7..3baf253be95a 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -169,7 +169,8 @@ int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid)
169 if (!new_id) { 169 if (!new_id) {
170 rc = -ENOMEM; 170 rc = -ENOMEM;
171 ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable " 171 ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable "
172 "to register daemon [%d] for user\n", pid, uid); 172 "to register daemon [%d] for user [%d]\n",
173 pid, uid);
173 goto unlock; 174 goto unlock;
174 } 175 }
175 if (!ecryptfs_find_daemon_id(uid, &old_id)) { 176 if (!ecryptfs_find_daemon_id(uid, &old_id)) {
diff --git a/fs/namei.c b/fs/namei.c
index 161e2225c757..ee60cc4d3453 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2688,10 +2688,11 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
2688{ 2688{
2689 struct address_space *mapping = inode->i_mapping; 2689 struct address_space *mapping = inode->i_mapping;
2690 struct page *page; 2690 struct page *page;
2691 int err = -ENOMEM; 2691 int err;
2692 char *kaddr; 2692 char *kaddr;
2693 2693
2694retry: 2694retry:
2695 err = -ENOMEM;
2695 page = find_or_create_page(mapping, 0, gfp_mask); 2696 page = find_or_create_page(mapping, 0, gfp_mask);
2696 if (!page) 2697 if (!page)
2697 goto fail; 2698 goto fail;
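
Annotation: the __page_symlink() hunk moves the -ENOMEM initialization from the declaration down to the retry: label, so each pass through the loop starts from a clean error value rather than whatever the previous attempt left behind. A reduced sketch of the retry shape, with the prepare/commit-write steps elided (example_retry() is illustrative):

	#include <linux/pagemap.h>

	static int example_retry(struct address_space *mapping, gfp_t gfp_mask)
	{
		struct page *page;
		int err;

	retry:
		err = -ENOMEM;				/* reset on every pass */
		page = find_or_create_page(mapping, 0, gfp_mask);
		if (!page)
			return err;
		/* ... ->prepare_write()/->commit_write(); on AOP_TRUNCATED_PAGE,
		 *     release the page and goto retry ... */
		unlock_page(page);
		page_cache_release(page);
		return 0;
	}
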
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 5d94555cdc83..832673b14587 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -61,9 +61,11 @@
61 61
62/* flags used to simulate posix default ACLs */ 62/* flags used to simulate posix default ACLs */
63#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ 63#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
64 | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE) 64 | NFS4_ACE_DIRECTORY_INHERIT_ACE)
65 65
66#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP) 66#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \
67 | NFS4_ACE_INHERIT_ONLY_ACE \
68 | NFS4_ACE_IDENTIFIER_GROUP)
67 69
68#define MASK_EQUAL(mask1, mask2) \ 70#define MASK_EQUAL(mask1, mask2) \
69 ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) 71 ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
@@ -87,12 +89,19 @@ mask_from_posix(unsigned short perm, unsigned int flags)
87} 89}
88 90
89static u32 91static u32
90deny_mask(u32 allow_mask, unsigned int flags) 92deny_mask_from_posix(unsigned short perm, u32 flags)
91{ 93{
92 u32 ret = ~allow_mask & ~NFS4_MASK_UNSUPP; 94 u32 mask = 0;
93 if (!(flags & NFS4_ACL_DIR)) 95
94 ret &= ~NFS4_ACE_DELETE_CHILD; 96 if (perm & ACL_READ)
95 return ret; 97 mask |= NFS4_READ_MODE;
98 if (perm & ACL_WRITE)
99 mask |= NFS4_WRITE_MODE;
100 if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
101 mask |= NFS4_ACE_DELETE_CHILD;
102 if (perm & ACL_EXECUTE)
103 mask |= NFS4_EXECUTE_MODE;
104 return mask;
96} 105}
97 106
98/* XXX: modify functions to return NFS errors; they're only ever 107/* XXX: modify functions to return NFS errors; they're only ever
@@ -126,108 +135,151 @@ struct ace_container {
126}; 135};
127 136
128static short ace2type(struct nfs4_ace *); 137static short ace2type(struct nfs4_ace *);
129static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); 138static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
130static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int); 139 unsigned int);
131int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); 140void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
132static int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *);
133 141
134struct nfs4_acl * 142struct nfs4_acl *
135nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, 143nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
136 unsigned int flags) 144 unsigned int flags)
137{ 145{
138 struct nfs4_acl *acl; 146 struct nfs4_acl *acl;
139 int error = -EINVAL; 147 int size = 0;
140 148
141 if ((pacl != NULL && 149 if (pacl) {
142 (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) || 150 if (posix_acl_valid(pacl) < 0)
143 (dpacl != NULL && 151 return ERR_PTR(-EINVAL);
144 (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0))) 152 size += 2*pacl->a_count;
145 goto out_err;
146
147 acl = nfs4_acl_new();
148 if (acl == NULL) {
149 error = -ENOMEM;
150 goto out_err;
151 } 153 }
152 154 if (dpacl) {
153 if (pacl != NULL) { 155 if (posix_acl_valid(dpacl) < 0)
154 error = _posix_to_nfsv4_one(pacl, acl, 156 return ERR_PTR(-EINVAL);
155 flags & ~NFS4_ACL_TYPE_DEFAULT); 157 size += 2*dpacl->a_count;
156 if (error < 0)
157 goto out_acl;
158 } 158 }
159 159
160 if (dpacl != NULL) { 160 /* Allocate for worst case: one (deny, allow) pair each: */
161 error = _posix_to_nfsv4_one(dpacl, acl, 161 acl = nfs4_acl_new(size);
162 flags | NFS4_ACL_TYPE_DEFAULT); 162 if (acl == NULL)
163 if (error < 0) 163 return ERR_PTR(-ENOMEM);
164 goto out_acl;
165 }
166 164
167 return acl; 165 if (pacl)
166 _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
168 167
169out_acl: 168 if (dpacl)
170 nfs4_acl_free(acl); 169 _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT);
171out_err:
172 acl = ERR_PTR(error);
173 170
174 return acl; 171 return acl;
175} 172}
176 173
177static int 174struct posix_acl_summary {
178nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype, 175 unsigned short owner;
179 uid_t owner, unsigned int flags) 176 unsigned short users;
177 unsigned short group;
178 unsigned short groups;
179 unsigned short other;
180 unsigned short mask;
181};
182
183static void
184summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
180{ 185{
181 int error; 186 struct posix_acl_entry *pa, *pe;
182 187 pas->users = 0;
183 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, 188 pas->groups = 0;
184 eflag, mask, whotype, owner); 189 pas->mask = 07;
185 if (error < 0) 190
186 return error; 191 pe = acl->a_entries + acl->a_count;
187 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, 192
188 eflag, deny_mask(mask, flags), whotype, owner); 193 FOREACH_ACL_ENTRY(pa, acl, pe) {
189 return error; 194 switch (pa->e_tag) {
195 case ACL_USER_OBJ:
196 pas->owner = pa->e_perm;
197 break;
198 case ACL_GROUP_OBJ:
199 pas->group = pa->e_perm;
200 break;
201 case ACL_USER:
202 pas->users |= pa->e_perm;
203 break;
204 case ACL_GROUP:
205 pas->groups |= pa->e_perm;
206 break;
207 case ACL_OTHER:
208 pas->other = pa->e_perm;
209 break;
210 case ACL_MASK:
211 pas->mask = pa->e_perm;
212 break;
213 }
214 }
215 /* We'll only care about effective permissions: */
216 pas->users &= pas->mask;
217 pas->group &= pas->mask;
218 pas->groups &= pas->mask;
190} 219}
191 220
192/* We assume the acl has been verified with posix_acl_valid. */ 221/* We assume the acl has been verified with posix_acl_valid. */
193static int 222static void
194_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, 223_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
195 unsigned int flags) 224 unsigned int flags)
196{ 225{
197 struct posix_acl_entry *pa, *pe, *group_owner_entry; 226 struct posix_acl_entry *pa, *group_owner_entry;
198 int error = -EINVAL; 227 struct nfs4_ace *ace;
199 u32 mask, mask_mask; 228 struct posix_acl_summary pas;
229 unsigned short deny;
200 int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? 230 int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
201 NFS4_INHERITANCE_FLAGS : 0); 231 NFS4_INHERITANCE_FLAGS : 0);
202 232
203 BUG_ON(pacl->a_count < 3); 233 BUG_ON(pacl->a_count < 3);
204 pe = pacl->a_entries + pacl->a_count; 234 summarize_posix_acl(pacl, &pas);
205 pa = pe - 2; /* if mask entry exists, it's second from the last. */
206 if (pa->e_tag == ACL_MASK)
207 mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags);
208 else
209 mask_mask = 0;
210 235
211 pa = pacl->a_entries; 236 pa = pacl->a_entries;
212 BUG_ON(pa->e_tag != ACL_USER_OBJ); 237 ace = acl->aces + acl->naces;
213 mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
214 error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags);
215 if (error < 0)
216 goto out;
217 pa++;
218 238
219 while (pa->e_tag == ACL_USER) { 239 /* We could deny everything not granted by the owner: */
220 mask = mask_from_posix(pa->e_perm, flags); 240 deny = ~pas.owner;
221 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, 241 /*
222 eflag, mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id); 242 * but it is equivalent (and simpler) to deny only what is not
223 if (error < 0) 243 * granted by later entries:
224 goto out; 244 */
245 deny &= pas.users | pas.group | pas.groups | pas.other;
246 if (deny) {
247 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
248 ace->flag = eflag;
249 ace->access_mask = deny_mask_from_posix(deny, flags);
250 ace->whotype = NFS4_ACL_WHO_OWNER;
251 ace++;
252 acl->naces++;
253 }
225 254
255 ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
256 ace->flag = eflag;
257 ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
258 ace->whotype = NFS4_ACL_WHO_OWNER;
259 ace++;
260 acl->naces++;
261 pa++;
226 262
227 error = nfs4_acl_add_pair(acl, eflag, mask, 263 while (pa->e_tag == ACL_USER) {
228 NFS4_ACL_WHO_NAMED, pa->e_id, flags); 264 deny = ~(pa->e_perm & pas.mask);
229 if (error < 0) 265 deny &= pas.groups | pas.group | pas.other;
230 goto out; 266 if (deny) {
267 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
268 ace->flag = eflag;
269 ace->access_mask = deny_mask_from_posix(deny, flags);
270 ace->whotype = NFS4_ACL_WHO_NAMED;
271 ace->who = pa->e_id;
272 ace++;
273 acl->naces++;
274 }
275 ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
276 ace->flag = eflag;
277 ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
278 flags);
279 ace->whotype = NFS4_ACL_WHO_NAMED;
280 ace->who = pa->e_id;
281 ace++;
282 acl->naces++;
231 pa++; 283 pa++;
232 } 284 }
233 285
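
Annotation: the rewritten _posix_to_nfsv4_one() above works in two steps: summarize_posix_acl() collapses the POSIX ACL into effective permission bits per class, and the emit loop then writes at most one (DENY, ALLOW) pair per principal straight into the flat ace array, skipping the DENY when no later entry would grant the bits being withheld. A reduced sketch of that pattern for the owner entry only, using the helpers from the patch (emit_owner_aces() itself is an illustrative name, not a function in the file):

	static void emit_owner_aces(struct nfs4_acl *acl,
				    struct posix_acl_summary *pas,
				    unsigned int flags, int eflag)
	{
		struct nfs4_ace *ace = acl->aces + acl->naces;
		unsigned short deny;

		/* deny only what a later entry would otherwise grant */
		deny = ~pas->owner;
		deny &= pas->users | pas->group | pas->groups | pas->other;
		if (deny) {
			ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
			ace->flag = eflag;
			ace->access_mask = deny_mask_from_posix(deny, flags);
			ace->whotype = NFS4_ACL_WHO_OWNER;
			ace++;
			acl->naces++;
		}

		ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
		ace->flag = eflag;
		ace->access_mask = mask_from_posix(pas->owner, flags | NFS4_ACL_OWNER);
		ace->whotype = NFS4_ACL_WHO_OWNER;
		acl->naces++;
	}
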
@@ -236,67 +288,65 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
236 288
237 /* allow ACEs */ 289 /* allow ACEs */
238 290
239 if (pacl->a_count > 3) {
240 BUG_ON(pa->e_tag != ACL_GROUP_OBJ);
241 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
242 NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask,
243 NFS4_ACL_WHO_GROUP, 0);
244 if (error < 0)
245 goto out;
246 }
247 group_owner_entry = pa; 291 group_owner_entry = pa;
248 mask = mask_from_posix(pa->e_perm, flags); 292
249 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, 293 ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
250 NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, 294 ace->flag = eflag;
251 NFS4_ACL_WHO_GROUP, 0); 295 ace->access_mask = mask_from_posix(pas.group, flags);
252 if (error < 0) 296 ace->whotype = NFS4_ACL_WHO_GROUP;
253 goto out; 297 ace++;
298 acl->naces++;
254 pa++; 299 pa++;
255 300
256 while (pa->e_tag == ACL_GROUP) { 301 while (pa->e_tag == ACL_GROUP) {
257 mask = mask_from_posix(pa->e_perm, flags); 302 ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
258 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, 303 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
259 NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, 304 ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
260 NFS4_ACL_WHO_NAMED, pa->e_id); 305 flags);
261 if (error < 0) 306 ace->whotype = NFS4_ACL_WHO_NAMED;
262 goto out; 307 ace->who = pa->e_id;
263 308 ace++;
264 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, 309 acl->naces++;
265 NFS4_ACE_IDENTIFIER_GROUP | eflag, mask,
266 NFS4_ACL_WHO_NAMED, pa->e_id);
267 if (error < 0)
268 goto out;
269 pa++; 310 pa++;
270 } 311 }
271 312
272 /* deny ACEs */ 313 /* deny ACEs */
273 314
274 pa = group_owner_entry; 315 pa = group_owner_entry;
275 mask = mask_from_posix(pa->e_perm, flags); 316
276 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, 317 deny = ~pas.group & pas.other;
277 NFS4_ACE_IDENTIFIER_GROUP | eflag, 318 if (deny) {
278 deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0); 319 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
279 if (error < 0) 320 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
280 goto out; 321 ace->access_mask = deny_mask_from_posix(deny, flags);
322 ace->whotype = NFS4_ACL_WHO_GROUP;
323 ace++;
324 acl->naces++;
325 }
281 pa++; 326 pa++;
327
282 while (pa->e_tag == ACL_GROUP) { 328 while (pa->e_tag == ACL_GROUP) {
283 mask = mask_from_posix(pa->e_perm, flags); 329 deny = ~(pa->e_perm & pas.mask);
284 error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, 330 deny &= pas.other;
285 NFS4_ACE_IDENTIFIER_GROUP | eflag, 331 if (deny) {
286 deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id); 332 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
287 if (error < 0) 333 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
288 goto out; 334 ace->access_mask = mask_from_posix(deny, flags);
335 ace->whotype = NFS4_ACL_WHO_NAMED;
336 ace->who = pa->e_id;
337 ace++;
338 acl->naces++;
339 }
289 pa++; 340 pa++;
290 } 341 }
291 342
292 if (pa->e_tag == ACL_MASK) 343 if (pa->e_tag == ACL_MASK)
293 pa++; 344 pa++;
294 BUG_ON(pa->e_tag != ACL_OTHER); 345 ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
295 mask = mask_from_posix(pa->e_perm, flags); 346 ace->flag = eflag;
296 error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags); 347 ace->access_mask = mask_from_posix(pa->e_perm, flags);
297 348 ace->whotype = NFS4_ACL_WHO_EVERYONE;
298out: 349 acl->naces++;
299 return error;
300} 350}
301 351
302static void 352static void
@@ -342,46 +392,6 @@ sort_pacl(struct posix_acl *pacl)
342 return; 392 return;
343} 393}
344 394
345int
346nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
347 struct posix_acl **dpacl, unsigned int flags)
348{
349 struct nfs4_acl *dacl;
350 int error = -ENOMEM;
351
352 *pacl = NULL;
353 *dpacl = NULL;
354
355 dacl = nfs4_acl_new();
356 if (dacl == NULL)
357 goto out;
358
359 error = nfs4_acl_split(acl, dacl);
360 if (error)
361 goto out_acl;
362
363 *pacl = _nfsv4_to_posix_one(acl, flags);
364 if (IS_ERR(*pacl)) {
365 error = PTR_ERR(*pacl);
366 *pacl = NULL;
367 goto out_acl;
368 }
369
370 *dpacl = _nfsv4_to_posix_one(dacl, flags);
371 if (IS_ERR(*dpacl)) {
372 error = PTR_ERR(*dpacl);
373 *dpacl = NULL;
374 }
375out_acl:
376 if (error) {
377 posix_acl_release(*pacl);
378 *pacl = NULL;
379 }
380 nfs4_acl_free(dacl);
381out:
382 return error;
383}
384
385/* 395/*
386 * While processing the NFSv4 ACE, this maintains bitmasks representing 396 * While processing the NFSv4 ACE, this maintains bitmasks representing
387 * which permission bits have been allowed and which denied to a given 397 * which permission bits have been allowed and which denied to a given
@@ -406,6 +416,7 @@ struct posix_ace_state_array {
406 * calculated so far: */ 416 * calculated so far: */
407 417
408struct posix_acl_state { 418struct posix_acl_state {
419 int empty;
409 struct posix_ace_state owner; 420 struct posix_ace_state owner;
410 struct posix_ace_state group; 421 struct posix_ace_state group;
411 struct posix_ace_state other; 422 struct posix_ace_state other;
@@ -421,6 +432,7 @@ init_state(struct posix_acl_state *state, int cnt)
421 int alloc; 432 int alloc;
422 433
423 memset(state, 0, sizeof(struct posix_acl_state)); 434 memset(state, 0, sizeof(struct posix_acl_state));
435 state->empty = 1;
424 /* 436 /*
425 * In the worst case, each individual acl could be for a distinct 437 * In the worst case, each individual acl could be for a distinct
426 * named user or group, but we don't no which, so we allocate 438 * named user or group, but we don't no which, so we allocate
@@ -488,6 +500,20 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
488 int nace; 500 int nace;
489 int i, error = 0; 501 int i, error = 0;
490 502
503 /*
504 * ACLs with no ACEs are treated differently in the inheritable
505 * and effective cases: when there are no inheritable ACEs, we
506 * set a zero-length default posix acl:
507 */
508 if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
509 pacl = posix_acl_alloc(0, GFP_KERNEL);
510 return pacl ? pacl : ERR_PTR(-ENOMEM);
511 }
512 /*
513 * When there are no effective ACEs, the following will end
514 * up setting a 3-element effective posix ACL with all
515 * permissions zero.
516 */
491 nace = 4 + state->users->n + state->groups->n; 517 nace = 4 + state->users->n + state->groups->n;
492 pacl = posix_acl_alloc(nace, GFP_KERNEL); 518 pacl = posix_acl_alloc(nace, GFP_KERNEL);
493 if (!pacl) 519 if (!pacl)
@@ -603,6 +629,8 @@ static void process_one_v4_ace(struct posix_acl_state *state,
603 u32 mask = ace->access_mask; 629 u32 mask = ace->access_mask;
604 int i; 630 int i;
605 631
632 state->empty = 0;
633
606 switch (ace2type(ace)) { 634 switch (ace2type(ace)) {
607 case ACL_USER_OBJ: 635 case ACL_USER_OBJ:
608 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { 636 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
@@ -666,75 +694,62 @@ static void process_one_v4_ace(struct posix_acl_state *state,
666 } 694 }
667} 695}
668 696
669static struct posix_acl * 697int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
670_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) 698 struct posix_acl **dpacl, unsigned int flags)
671{ 699{
672 struct posix_acl_state state; 700 struct posix_acl_state effective_acl_state, default_acl_state;
673 struct posix_acl *pacl;
674 struct nfs4_ace *ace; 701 struct nfs4_ace *ace;
675 int ret; 702 int ret;
676 703
677 ret = init_state(&state, n4acl->naces); 704 ret = init_state(&effective_acl_state, acl->naces);
678 if (ret) 705 if (ret)
679 return ERR_PTR(ret); 706 return ret;
680 707 ret = init_state(&default_acl_state, acl->naces);
681 list_for_each_entry(ace, &n4acl->ace_head, l_ace) 708 if (ret)
682 process_one_v4_ace(&state, ace); 709 goto out_estate;
683 710 ret = -EINVAL;
684 pacl = posix_state_to_acl(&state, flags); 711 for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
685
686 free_state(&state);
687
688 if (!IS_ERR(pacl))
689 sort_pacl(pacl);
690 return pacl;
691}
692
693static int
694nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
695{
696 struct list_head *h, *n;
697 struct nfs4_ace *ace;
698 int error = 0;
699
700 list_for_each_safe(h, n, &acl->ace_head) {
701 ace = list_entry(h, struct nfs4_ace, l_ace);
702
703 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && 712 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
704 ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) 713 ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
705 return -EINVAL; 714 goto out_dstate;
706
707 if (ace->flag & ~NFS4_SUPPORTED_FLAGS) 715 if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
708 return -EINVAL; 716 goto out_dstate;
709 717 if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) {
710 switch (ace->flag & NFS4_INHERITANCE_FLAGS) { 718 process_one_v4_ace(&effective_acl_state, ace);
711 case 0:
712 /* Leave this ace in the effective acl: */
713 continue; 719 continue;
714 case NFS4_INHERITANCE_FLAGS:
715 /* Add this ace to the default acl and remove it
716 * from the effective acl: */
717 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
718 ace->access_mask, ace->whotype, ace->who);
719 if (error)
720 return error;
721 list_del(h);
722 kfree(ace);
723 acl->naces--;
724 break;
725 case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE:
726 /* Add this ace to the default, but leave it in
727 * the effective acl as well: */
728 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
729 ace->access_mask, ace->whotype, ace->who);
730 if (error)
731 return error;
732 break;
733 default:
734 return -EINVAL;
735 } 720 }
721 if (!(flags & NFS4_ACL_DIR))
722 goto out_dstate;
723 /*
724 * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT
725 * is set, we're effectively turning on the other. That's OK,
726 * according to rfc 3530.
727 */
728 process_one_v4_ace(&default_acl_state, ace);
729
730 if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE))
731 process_one_v4_ace(&effective_acl_state, ace);
736 } 732 }
737 return 0; 733 *pacl = posix_state_to_acl(&effective_acl_state, flags);
734 if (IS_ERR(*pacl)) {
735 ret = PTR_ERR(*pacl);
736 goto out_dstate;
737 }
738 *dpacl = posix_state_to_acl(&default_acl_state,
739 flags | NFS4_ACL_TYPE_DEFAULT);
740 if (IS_ERR(*dpacl)) {
741 ret = PTR_ERR(*dpacl);
742 posix_acl_release(*pacl);
743 goto out_dstate;
744 }
745 sort_pacl(*pacl);
746 sort_pacl(*dpacl);
747 ret = 0;
748out_dstate:
749 free_state(&default_acl_state);
750out_estate:
751 free_state(&effective_acl_state);
752 return ret;
738} 753}
739 754
740static short 755static short
@@ -759,48 +774,22 @@ EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
759EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); 774EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
760 775
761struct nfs4_acl * 776struct nfs4_acl *
762nfs4_acl_new(void) 777nfs4_acl_new(int n)
763{ 778{
764 struct nfs4_acl *acl; 779 struct nfs4_acl *acl;
765 780
766 if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL) 781 acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL);
782 if (acl == NULL)
767 return NULL; 783 return NULL;
768
769 acl->naces = 0; 784 acl->naces = 0;
770 INIT_LIST_HEAD(&acl->ace_head);
771
772 return acl; 785 return acl;
773} 786}
774 787
775void 788void
776nfs4_acl_free(struct nfs4_acl *acl)
777{
778 struct list_head *h;
779 struct nfs4_ace *ace;
780
781 if (!acl)
782 return;
783
784 while (!list_empty(&acl->ace_head)) {
785 h = acl->ace_head.next;
786 list_del(h);
787 ace = list_entry(h, struct nfs4_ace, l_ace);
788 kfree(ace);
789 }
790
791 kfree(acl);
792
793 return;
794}
795
796int
797nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, 789nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
798 int whotype, uid_t who) 790 int whotype, uid_t who)
799{ 791{
800 struct nfs4_ace *ace; 792 struct nfs4_ace *ace = acl->aces + acl->naces;
801
802 if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL)
803 return -ENOMEM;
804 793
805 ace->type = type; 794 ace->type = type;
806 ace->flag = flag; 795 ace->flag = flag;
@@ -808,10 +797,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
808 ace->whotype = whotype; 797 ace->whotype = whotype;
809 ace->who = who; 798 ace->who = who;
810 799
811 list_add_tail(&ace->l_ace, &acl->ace_head);
812 acl->naces++; 800 acl->naces++;
813
814 return 0;
815} 801}
816 802
817static struct { 803static struct {
@@ -865,7 +851,6 @@ nfs4_acl_write_who(int who, char *p)
865} 851}
866 852
867EXPORT_SYMBOL(nfs4_acl_new); 853EXPORT_SYMBOL(nfs4_acl_new);
868EXPORT_SYMBOL(nfs4_acl_free);
869EXPORT_SYMBOL(nfs4_acl_add_ace); 854EXPORT_SYMBOL(nfs4_acl_add_ace);
870EXPORT_SYMBOL(nfs4_acl_get_whotype); 855EXPORT_SYMBOL(nfs4_acl_get_whotype);
871EXPORT_SYMBOL(nfs4_acl_write_who); 856EXPORT_SYMBOL(nfs4_acl_write_who);
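
Annotation: the net effect of the nfs4acl.c changes is a representation switch. An nfs4_acl is no longer a list head plus individually kmalloc'd nfs4_ace nodes: nfs4_acl_new(n) sizes a single allocation up front, nfs4_acl_add_ace() appends in place, iteration is a plain pointer walk, and callers free the whole thing with one kfree(), which is why nfs4_acl_free() disappears. The header side of the change is not part of this section, so the layout below is an assumption sketched from how the code indexes acl->aces and acl->naces:

	#include <linux/slab.h>

	/* Assumed shape of the flat ACL (see include/linux/nfs4_acl.h for the real one). */
	struct example_nfs4_ace {
		u32	type;
		u32	flag;
		u32	access_mask;
		int	whotype;
		uid_t	who;
	};

	struct example_nfs4_acl {
		u32			naces;
		struct example_nfs4_ace	aces[0];	/* trailing array, sized at alloc time */
	};

	static struct example_nfs4_acl *example_acl_new(int n)
	{
		struct example_nfs4_acl *acl;

		acl = kmalloc(sizeof(*acl) + n * sizeof(struct example_nfs4_ace),
			      GFP_KERNEL);
		if (acl == NULL)
			return NULL;
		acl->naces = 0;
		return acl;
	}

	/* Iterate:  for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) ...
	 * Release:  kfree(acl);  -- no per-ACE frees are needed any more. */
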
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f57655a7a2b6..fb14d68eacab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -387,7 +387,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
387 .address = (struct sockaddr *)&addr, 387 .address = (struct sockaddr *)&addr,
388 .addrsize = sizeof(addr), 388 .addrsize = sizeof(addr),
389 .timeout = &timeparms, 389 .timeout = &timeparms,
390 .servername = clp->cl_name.data,
391 .program = program, 390 .program = program,
392 .version = nfs_cb_version[1]->number, 391 .version = nfs_cb_version[1]->number,
393 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 392 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
@@ -397,6 +396,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
397 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 396 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
398 .rpc_argp = clp, 397 .rpc_argp = clp,
399 }; 398 };
399 char clientname[16];
400 int status; 400 int status;
401 401
402 if (atomic_read(&cb->cb_set)) 402 if (atomic_read(&cb->cb_set))
@@ -419,6 +419,11 @@ nfsd4_probe_callback(struct nfs4_client *clp)
419 memset(program->stats, 0, sizeof(cb->cb_stat)); 419 memset(program->stats, 0, sizeof(cb->cb_stat));
420 program->stats->program = program; 420 program->stats->program = program;
421 421
422 /* Just here to make some printk's more useful: */
423 snprintf(clientname, sizeof(clientname),
424 "%u.%u.%u.%u", NIPQUAD(addr.sin_addr));
425 args.servername = clientname;
426
422 /* Create RPC client */ 427 /* Create RPC client */
423 cb->cb_client = rpc_create(&args); 428 cb->cb_client = rpc_create(&args);
424 if (IS_ERR(cb->cb_client)) { 429 if (IS_ERR(cb->cb_client)) {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0efba557fb55..5d090f11f2be 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -199,24 +199,22 @@ defer_free(struct nfsd4_compoundargs *argp,
199 199
200static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 200static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
201{ 201{
202 void *new = NULL;
203 if (p == argp->tmp) { 202 if (p == argp->tmp) {
204 new = kmalloc(nbytes, GFP_KERNEL); 203 p = kmalloc(nbytes, GFP_KERNEL);
205 if (!new) return NULL; 204 if (!p)
206 p = new; 205 return NULL;
207 memcpy(p, argp->tmp, nbytes); 206 memcpy(p, argp->tmp, nbytes);
208 } else { 207 } else {
209 BUG_ON(p != argp->tmpp); 208 BUG_ON(p != argp->tmpp);
210 argp->tmpp = NULL; 209 argp->tmpp = NULL;
211 } 210 }
212 if (defer_free(argp, kfree, p)) { 211 if (defer_free(argp, kfree, p)) {
213 kfree(new); 212 kfree(p);
214 return NULL; 213 return NULL;
215 } else 214 } else
216 return (char *)p; 215 return (char *)p;
217} 216}
218 217
219
220static __be32 218static __be32
221nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 219nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
222{ 220{
@@ -255,7 +253,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
255 return status; 253 return status;
256 254
257 /* 255 /*
258 * According to spec, unsupported attributes return ERR_NOTSUPP; 256 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
259 * read-only attributes return ERR_INVAL. 257 * read-only attributes return ERR_INVAL.
260 */ 258 */
261 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 259 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
@@ -273,42 +271,42 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
273 iattr->ia_valid |= ATTR_SIZE; 271 iattr->ia_valid |= ATTR_SIZE;
274 } 272 }
275 if (bmval[0] & FATTR4_WORD0_ACL) { 273 if (bmval[0] & FATTR4_WORD0_ACL) {
276 int nace, i; 274 int nace;
277 struct nfs4_ace ace; 275 struct nfs4_ace *ace;
278 276
279 READ_BUF(4); len += 4; 277 READ_BUF(4); len += 4;
280 READ32(nace); 278 READ32(nace);
281 279
282 *acl = nfs4_acl_new(); 280 if (nace > NFS4_ACL_MAX)
281 return nfserr_resource;
282
283 *acl = nfs4_acl_new(nace);
283 if (*acl == NULL) { 284 if (*acl == NULL) {
284 host_err = -ENOMEM; 285 host_err = -ENOMEM;
285 goto out_nfserr; 286 goto out_nfserr;
286 } 287 }
287 defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); 288 defer_free(argp, kfree, *acl);
288 289
289 for (i = 0; i < nace; i++) { 290 (*acl)->naces = nace;
291 for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
290 READ_BUF(16); len += 16; 292 READ_BUF(16); len += 16;
291 READ32(ace.type); 293 READ32(ace->type);
292 READ32(ace.flag); 294 READ32(ace->flag);
293 READ32(ace.access_mask); 295 READ32(ace->access_mask);
294 READ32(dummy32); 296 READ32(dummy32);
295 READ_BUF(dummy32); 297 READ_BUF(dummy32);
296 len += XDR_QUADLEN(dummy32) << 2; 298 len += XDR_QUADLEN(dummy32) << 2;
297 READMEM(buf, dummy32); 299 READMEM(buf, dummy32);
298 ace.whotype = nfs4_acl_get_whotype(buf, dummy32); 300 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
299 host_err = 0; 301 host_err = 0;
300 if (ace.whotype != NFS4_ACL_WHO_NAMED) 302 if (ace->whotype != NFS4_ACL_WHO_NAMED)
301 ace.who = 0; 303 ace->who = 0;
302 else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) 304 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
303 host_err = nfsd_map_name_to_gid(argp->rqstp, 305 host_err = nfsd_map_name_to_gid(argp->rqstp,
304 buf, dummy32, &ace.who); 306 buf, dummy32, &ace->who);
305 else 307 else
306 host_err = nfsd_map_name_to_uid(argp->rqstp, 308 host_err = nfsd_map_name_to_uid(argp->rqstp,
307 buf, dummy32, &ace.who); 309 buf, dummy32, &ace->who);
308 if (host_err)
309 goto out_nfserr;
310 host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
311 ace.access_mask, ace.whotype, ace.who);
312 if (host_err) 310 if (host_err)
313 goto out_nfserr; 311 goto out_nfserr;
314 } 312 }
@@ -1596,7 +1594,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1596 } 1594 }
1597 if (bmval0 & FATTR4_WORD0_ACL) { 1595 if (bmval0 & FATTR4_WORD0_ACL) {
1598 struct nfs4_ace *ace; 1596 struct nfs4_ace *ace;
1599 struct list_head *h;
1600 1597
1601 if (acl == NULL) { 1598 if (acl == NULL) {
1602 if ((buflen -= 4) < 0) 1599 if ((buflen -= 4) < 0)
@@ -1609,9 +1606,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1609 goto out_resource; 1606 goto out_resource;
1610 WRITE32(acl->naces); 1607 WRITE32(acl->naces);
1611 1608
1612 list_for_each(h, &acl->ace_head) { 1609 for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
1613 ace = list_entry(h, struct nfs4_ace, l_ace);
1614
1615 if ((buflen -= 4*3) < 0) 1610 if ((buflen -= 4*3) < 0)
1616 goto out_resource; 1611 goto out_resource;
1617 WRITE32(ace->type); 1612 WRITE32(ace->type);
@@ -1821,7 +1816,7 @@ out_acl:
1821 status = nfs_ok; 1816 status = nfs_ok;
1822 1817
1823out: 1818out:
1824 nfs4_acl_free(acl); 1819 kfree(acl);
1825 if (fhp == &tempfh) 1820 if (fhp == &tempfh)
1826 fh_put(&tempfh); 1821 fh_put(&tempfh);
1827 return status; 1822 return status;
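
Annotation: the nfs4xdr.c hunk is the consumer side of the same change. The decoder now caps the client-supplied ACE count at NFS4_ACL_MAX, allocates the whole array with nfs4_acl_new(nace), registers it for a plain kfree() via defer_free(), and decodes each ACE in place instead of filling a stack-local nfs4_ace and copying it through nfs4_acl_add_ace(). A reduced sketch of the decode loop's shape (READ32/READ_BUF are the file's local XDR macros seen above):

	READ_BUF(4); len += 4;
	READ32(nace);
	if (nace > NFS4_ACL_MAX)		/* refuse absurd counts before allocating */
		return nfserr_resource;

	*acl = nfs4_acl_new(nace);
	if (*acl == NULL) {
		host_err = -ENOMEM;
		goto out_nfserr;
	}
	defer_free(argp, kfree, *acl);		/* one allocation, one kfree */

	(*acl)->naces = nace;
	for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
		/* READ32(ace->type), READ32(ace->flag), READ32(ace->access_mask),
		 * then map the "who" string to a uid/gid or a special whotype */
	}
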
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8283236c6a0f..7e6aa245b5d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -466,7 +466,10 @@ out:
466 posix_acl_release(dpacl); 466 posix_acl_release(dpacl);
467 return (error); 467 return (error);
468out_nfserr: 468out_nfserr:
469 error = nfserrno(host_error); 469 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp;
471 else
472 error = nfserrno(host_error);
470 goto out; 473 goto out;
471} 474}
472 475
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 7798d2a9f793..916c0102db5b 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -79,6 +79,7 @@ struct acpi_processor_power {
79 u32 bm_activity; 79 u32 bm_activity;
80 int count; 80 int count;
81 struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; 81 struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
82 int timer_broadcast_on_state;
82}; 83};
83 84
84/* Performance Management */ 85/* Performance Management */
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index 3a61206fd108..cc6b1652249a 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -95,9 +95,7 @@ static inline void ack_APIC_irq(void)
95 apic_write_around(APIC_EOI, 0); 95 apic_write_around(APIC_EOI, 0);
96} 96}
97 97
98extern void (*wait_timer_tick)(void); 98extern int lapic_get_maxlvt(void);
99
100extern int get_maxlvt(void);
101extern void clear_local_APIC(void); 99extern void clear_local_APIC(void);
102extern void connect_bsp_APIC (void); 100extern void connect_bsp_APIC (void);
103extern void disconnect_bsp_APIC (int virt_wire_setup); 101extern void disconnect_bsp_APIC (int virt_wire_setup);
@@ -113,14 +111,9 @@ extern void smp_local_timer_interrupt (void);
113extern void setup_boot_APIC_clock (void); 111extern void setup_boot_APIC_clock (void);
114extern void setup_secondary_APIC_clock (void); 112extern void setup_secondary_APIC_clock (void);
115extern int APIC_init_uniprocessor (void); 113extern int APIC_init_uniprocessor (void);
116extern void disable_APIC_timer(void);
117extern void enable_APIC_timer(void);
118 114
119extern void enable_NMI_through_LVT0 (void * dummy); 115extern void enable_NMI_through_LVT0 (void * dummy);
120 116
121void smp_send_timer_broadcast_ipi(void);
122void switch_APIC_timer_to_ipi(void *cpumask);
123void switch_ipi_to_APIC_timer(void *cpumask);
124#define ARCH_APICTIMER_STOPS_ON_C3 1 117#define ARCH_APICTIMER_STOPS_ON_C3 1
125 118
126extern int timer_over_8254; 119extern int timer_over_8254;
diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h
index e47be9a56cc2..fc03cf9de5c4 100644
--- a/include/asm-i386/hpet.h
+++ b/include/asm-i386/hpet.h
@@ -90,16 +90,19 @@
90#define HPET_MIN_PERIOD (100000UL) 90#define HPET_MIN_PERIOD (100000UL)
91#define HPET_TICK_RATE (HZ * 100000UL) 91#define HPET_TICK_RATE (HZ * 100000UL)
92 92
93extern unsigned long hpet_tick; /* hpet clks count per tick */
94extern unsigned long hpet_address; /* hpet memory map physical address */ 93extern unsigned long hpet_address; /* hpet memory map physical address */
95extern int hpet_use_timer; 94extern int is_hpet_enabled(void);
96 95
96#ifdef CONFIG_X86_64
97extern unsigned long hpet_tick; /* hpet clks count per tick */
98extern int hpet_use_timer;
97extern int hpet_rtc_timer_init(void); 99extern int hpet_rtc_timer_init(void);
98extern int hpet_enable(void); 100extern int hpet_enable(void);
99extern int hpet_reenable(void);
100extern int is_hpet_enabled(void);
101extern int is_hpet_capable(void); 101extern int is_hpet_capable(void);
102extern int hpet_readl(unsigned long a); 102extern int hpet_readl(unsigned long a);
103#else
104extern int hpet_enable(void);
105#endif
103 106
104#ifdef CONFIG_HPET_EMULATE_RTC 107#ifdef CONFIG_HPET_EMULATE_RTC
105extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); 108extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
@@ -110,5 +113,10 @@ extern int hpet_rtc_dropped_irq(void);
110extern int hpet_rtc_timer_init(void); 113extern int hpet_rtc_timer_init(void);
111extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); 114extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
112#endif /* CONFIG_HPET_EMULATE_RTC */ 115#endif /* CONFIG_HPET_EMULATE_RTC */
116
117#else
118
119static inline int hpet_enable(void) { return 0; }
120
113#endif /* CONFIG_HPET_TIMER */ 121#endif /* CONFIG_HPET_TIMER */
114#endif /* _I386_HPET_H */ 122#endif /* _I386_HPET_H */
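
Annotation: the i386 hpet.h reorganisation keeps is_hpet_enabled()/hpet_enable() visible for the new clocksource/clockevents code, moves the legacy hpet_tick/hpet_use_timer style externs under an #ifdef CONFIG_X86_64 block (which still uses them), and adds a stub for kernels built without CONFIG_HPET_TIMER so callers need no #ifdef of their own. The stub pattern in isolation:

	/* Sketch: a no-op fallback keeps the call sites #ifdef-free. */
	#ifdef CONFIG_HPET_TIMER
	extern int hpet_enable(void);			/* real implementation */
	#else
	static inline int hpet_enable(void) { return 0; }	/* no HPET support built in */
	#endif
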
diff --git a/include/asm-i386/i8253.h b/include/asm-i386/i8253.h
index 015d8df07690..6cb0dd4dcdde 100644
--- a/include/asm-i386/i8253.h
+++ b/include/asm-i386/i8253.h
@@ -1,6 +1,21 @@
1#ifndef __ASM_I8253_H__ 1#ifndef __ASM_I8253_H__
2#define __ASM_I8253_H__ 2#define __ASM_I8253_H__
3 3
4#include <linux/clockchips.h>
5
4extern spinlock_t i8253_lock; 6extern spinlock_t i8253_lock;
5 7
8extern struct clock_event_device *global_clock_event;
9
10/**
11 * pit_interrupt_hook - hook into timer tick
12 * @regs: standard registers from interrupt
13 *
14 * Call the global clock event handler.
15 **/
16static inline void pit_interrupt_hook(void)
17{
18 global_clock_event->event_handler(global_clock_event);
19}
20
6#endif /* __ASM_I8253_H__ */ 21#endif /* __ASM_I8253_H__ */
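
Annotation: the new i8253.h exports the PIT's clock_event_device as global_clock_event and wraps tick delivery in pit_interrupt_hook(); the do_timer_interrupt_hook() implementations in the two mach-*/do_timer.h files below shrink to a single call into it, since jiffies accounting, process-time updates and profiling are now driven by whatever handler the clockevents core installed. The whole indirection is just:

	#include <linux/clockchips.h>

	extern struct clock_event_device *global_clock_event;	/* the PIT, set up in i8253.c */

	static inline void pit_interrupt_hook(void)
	{
		/* forward the tick to the handler chosen by the clockevents core
		 * (e.g. the periodic tick handler) */
		global_clock_event->event_handler(global_clock_event);
	}
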
diff --git a/include/asm-i386/mach-default/do_timer.h b/include/asm-i386/mach-default/do_timer.h
index 7d606e3364ae..56e5689863ae 100644
--- a/include/asm-i386/mach-default/do_timer.h
+++ b/include/asm-i386/mach-default/do_timer.h
@@ -1,86 +1,16 @@
1/* defines for inline arch setup functions */ 1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
2 3
3#include <asm/apic.h>
4#include <asm/i8259.h> 4#include <asm/i8259.h>
5#include <asm/i8253.h>
5 6
6/** 7/**
7 * do_timer_interrupt_hook - hook into timer tick 8 * do_timer_interrupt_hook - hook into timer tick
8 * @regs: standard registers from interrupt
9 * 9 *
10 * Description: 10 * Call the pit clock event handler. see asm/i8253.h
11 * This hook is called immediately after the timer interrupt is ack'd.
12 * It's primary purpose is to allow architectures that don't possess
13 * individual per CPU clocks (like the CPU APICs supply) to broadcast the
14 * timer interrupt as a means of triggering reschedules etc.
15 **/ 11 **/
16 12
17static inline void do_timer_interrupt_hook(void) 13static inline void do_timer_interrupt_hook(void)
18{ 14{
19 do_timer(1); 15 pit_interrupt_hook();
20#ifndef CONFIG_SMP
21 update_process_times(user_mode_vm(get_irq_regs()));
22#endif
23/*
24 * In the SMP case we use the local APIC timer interrupt to do the
25 * profiling, except when we simulate SMP mode on a uniprocessor
26 * system, in that case we have to call the local interrupt handler.
27 */
28#ifndef CONFIG_X86_LOCAL_APIC
29 profile_tick(CPU_PROFILING);
30#else
31 if (!using_apic_timer)
32 smp_local_timer_interrupt();
33#endif
34}
35
36
37/* you can safely undefine this if you don't have the Neptune chipset */
38
39#define BUGGY_NEPTUN_TIMER
40
41/**
42 * do_timer_overflow - process a detected timer overflow condition
43 * @count: hardware timer interrupt count on overflow
44 *
45 * Description:
46 * This call is invoked when the jiffies count has not incremented but
47 * the hardware timer interrupt has. It means that a timer tick interrupt
48 * came along while the previous one was pending, thus a tick was missed
49 **/
50static inline int do_timer_overflow(int count)
51{
52 int i;
53
54 spin_lock(&i8259A_lock);
55 /*
56 * This is tricky when I/O APICs are used;
57 * see do_timer_interrupt().
58 */
59 i = inb(0x20);
60 spin_unlock(&i8259A_lock);
61
62 /* assumption about timer being IRQ0 */
63 if (i & 0x01) {
64 /*
65 * We cannot detect lost timer interrupts ...
66 * well, that's why we call them lost, don't we? :)
67 * [hmm, on the Pentium and Alpha we can ... sort of]
68 */
69 count -= LATCH;
70 } else {
71#ifdef BUGGY_NEPTUN_TIMER
72 /*
73 * for the Neptun bug we know that the 'latch'
74 * command doesn't latch the high and low value
75 * of the counter atomically. Thus we have to
76 * substract 256 from the counter
77 * ... funny, isnt it? :)
78 */
79
80 count -= 256;
81#else
82 printk("do_slow_gettimeoffset(): hardware timer problem?\n");
83#endif
84 }
85 return count;
86} 16}
diff --git a/include/asm-i386/mach-voyager/do_timer.h b/include/asm-i386/mach-voyager/do_timer.h
index 04e69c104a74..60f9dcc15d54 100644
--- a/include/asm-i386/mach-voyager/do_timer.h
+++ b/include/asm-i386/mach-voyager/do_timer.h
@@ -1,25 +1,18 @@
1/* defines for inline arch setup functions */ 1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
2#include <asm/voyager.h> 4#include <asm/voyager.h>
5#include <asm/i8253.h>
3 6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 * @regs: standard registers from interrupt
10 *
11 * Call the pit clock event handler. see asm/i8253.h
12 **/
4static inline void do_timer_interrupt_hook(void) 13static inline void do_timer_interrupt_hook(void)
5{ 14{
6 do_timer(1); 15 pit_interrupt_hook();
7#ifndef CONFIG_SMP
8 update_process_times(user_mode_vm(irq_regs));
9#endif
10
11 voyager_timer_interrupt(); 16 voyager_timer_interrupt();
12} 17}
13 18
14static inline int do_timer_overflow(int count)
15{
16 /* can't read the ISR, just assume 1 tick
17 overflow */
18 if(count > LATCH || count < 0) {
19 printk(KERN_ERR "VOYAGER PROBLEM: count is %d, latch is %d\n", count, LATCH);
20 count = LATCH;
21 }
22 count -= LATCH;
23
24 return count;
25}
diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h
index 770bf6da8c3d..f21349399d14 100644
--- a/include/asm-i386/mpspec.h
+++ b/include/asm-i386/mpspec.h
@@ -23,7 +23,6 @@ extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
23extern int mpc_default_type; 23extern int mpc_default_type;
24extern unsigned long mp_lapic_addr; 24extern unsigned long mp_lapic_addr;
25extern int pic_mode; 25extern int pic_mode;
26extern int using_apic_timer;
27 26
28#ifdef CONFIG_ACPI 27#ifdef CONFIG_ACPI
29extern void mp_register_lapic (u8 id, u8 enabled); 28extern void mp_register_lapic (u8 id, u8 enabled);
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 609a3899475c..6db40d0583f1 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -307,4 +307,7 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
307#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f 307#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f
308#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390 308#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390
309 309
310/* Geode defined MSRs */
311#define MSR_GEODE_BUSCONT_CONF0 0x1900
312
310#endif /* __ASM_MSR_H */ 313#endif /* __ASM_MSR_H */
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index c13933185c1c..e997891cc7cc 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -1,48 +1 @@
1/* #include <asm-x86_64/tsc.h>
2 * linux/include/asm-i386/tsc.h
3 *
4 * i386 TSC related functions
5 */
6#ifndef _ASM_i386_TSC_H
7#define _ASM_i386_TSC_H
8
9#include <asm/processor.h>
10
11/*
12 * Standard way to access the cycle counter on i586+ CPUs.
13 * Currently only used on SMP.
14 *
15 * If you really have a SMP machine with i486 chips or older,
16 * compile for that, and this will just always return zero.
17 * That's ok, it just means that the nicer scheduling heuristics
18 * won't work for you.
19 *
20 * We only use the low 32 bits, and we'd simply better make sure
21 * that we reschedule before that wraps. Scheduling at least every
22 * four billion cycles just basically sounds like a good idea,
23 * regardless of how fast the machine is.
24 */
25typedef unsigned long long cycles_t;
26
27extern unsigned int cpu_khz;
28extern unsigned int tsc_khz;
29
30static inline cycles_t get_cycles(void)
31{
32 unsigned long long ret = 0;
33
34#ifndef CONFIG_X86_TSC
35 if (!cpu_has_tsc)
36 return 0;
37#endif
38
39#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
40 rdtscll(ret);
41#endif
42 return ret;
43}
44
45extern void tsc_init(void);
46extern void mark_tsc_unstable(void);
47
48#endif
diff --git a/include/asm-x86_64/hpet.h b/include/asm-x86_64/hpet.h
index b39098408b69..59a66f084611 100644
--- a/include/asm-x86_64/hpet.h
+++ b/include/asm-x86_64/hpet.h
@@ -56,8 +56,15 @@
56extern int is_hpet_enabled(void); 56extern int is_hpet_enabled(void);
57extern int hpet_rtc_timer_init(void); 57extern int hpet_rtc_timer_init(void);
58extern int apic_is_clustered_box(void); 58extern int apic_is_clustered_box(void);
59extern int hpet_arch_init(void);
60extern int hpet_timer_stop_set_go(unsigned long tick);
61extern int hpet_reenable(void);
62extern unsigned int hpet_calibrate_tsc(void);
59 63
60extern int hpet_use_timer; 64extern int hpet_use_timer;
65extern unsigned long hpet_address;
66extern unsigned long hpet_period;
67extern unsigned long hpet_tick;
61 68
62#ifdef CONFIG_HPET_EMULATE_RTC 69#ifdef CONFIG_HPET_EMULATE_RTC
63extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); 70extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index a6d2ff5c69b7..f54f3abf93ce 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -45,11 +45,7 @@ extern u32 pmtmr_ioport;
45#else 45#else
46#define pmtmr_ioport 0 46#define pmtmr_ioport 0
47#endif 47#endif
48extern unsigned long long monotonic_base;
49extern int sysctl_vsyscall;
50extern int nohpet; 48extern int nohpet;
51extern unsigned long vxtime_hz;
52extern void time_init_gtod(void);
53 49
54extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2))); 50extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
55 51
@@ -91,8 +87,6 @@ extern void check_efer(void);
91 87
92extern int unhandled_signal(struct task_struct *tsk, int sig); 88extern int unhandled_signal(struct task_struct *tsk, int sig);
93 89
94extern int unsynchronized_tsc(void);
95
96extern void select_idle_routine(const struct cpuinfo_x86 *c); 90extern void select_idle_routine(const struct cpuinfo_x86 *c);
97 91
98extern unsigned long table_start, table_end; 92extern unsigned long table_start, table_end;
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index b9e5320b7625..8c6808a3fba4 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -12,38 +12,21 @@
12#include <asm/hpet.h> 12#include <asm/hpet.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/processor.h> 14#include <asm/processor.h>
15#include <asm/tsc.h>
15#include <linux/compiler.h> 16#include <linux/compiler.h>
16 17
17#define CLOCK_TICK_RATE PIT_TICK_RATE /* Underlying HZ */ 18#define CLOCK_TICK_RATE PIT_TICK_RATE /* Underlying HZ */
18 19
19typedef unsigned long long cycles_t;
20
21static inline cycles_t get_cycles (void)
22{
23 unsigned long long ret;
24
25 rdtscll(ret);
26 return ret;
27}
28
29/* Like get_cycles, but make sure the CPU is synchronized. */
30static __always_inline cycles_t get_cycles_sync(void)
31{
32 unsigned long long ret;
33 unsigned eax;
34 /* Don't do an additional sync on CPUs where we know
35 RDTSC is already synchronous. */
36 alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
37 "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
38 rdtscll(ret);
39 return ret;
40}
41
42extern unsigned int cpu_khz;
43
44extern int read_current_timer(unsigned long *timer_value); 20extern int read_current_timer(unsigned long *timer_value);
45#define ARCH_HAS_READ_CURRENT_TIMER 1 21#define ARCH_HAS_READ_CURRENT_TIMER 1
46 22
47extern struct vxtime_data vxtime; 23#define USEC_PER_TICK (USEC_PER_SEC / HZ)
24#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
25#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
26
27#define NS_SCALE 10 /* 2^10, carefully chosen */
28#define US_SCALE 32 /* 2^32, arbitrarily chosen */
48 29
30extern void mark_tsc_unstable(void);
31extern void set_cyc2ns_scale(unsigned long khz);
49#endif 32#endif
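
The NS_SCALE/US_SCALE constants above support fixed-point cycles-to-nanoseconds math. A minimal sketch of the idea, assuming a cyc2ns scale value derived from the CPU frequency in kHz (all example_* names here are illustrative, not the kernel's actual internals):

/* Sketch of scaled cycles->ns conversion; names are invented for illustration. */
static unsigned long example_cyc2ns_scale;	/* ns per cycle << NS_SCALE */

static void example_set_cyc2ns_scale(unsigned long cpu_khz)
{
	/* 1000000 ns per ms divided by kHz = ns per cycle, keeping 2^10 fraction bits */
	example_cyc2ns_scale = (1000000UL << 10) / cpu_khz;	/* 10 == NS_SCALE */
}

static unsigned long long example_cycles_to_ns(unsigned long long cycles)
{
	return (cycles * example_cyc2ns_scale) >> 10;		/* >> NS_SCALE */
}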
diff --git a/include/asm-x86_64/tsc.h b/include/asm-x86_64/tsc.h
new file mode 100644
index 000000000000..9a0a368852c7
--- /dev/null
+++ b/include/asm-x86_64/tsc.h
@@ -0,0 +1,66 @@
1/*
2 * linux/include/asm-x86_64/tsc.h
3 *
4 * x86_64 TSC related functions
5 */
6#ifndef _ASM_x86_64_TSC_H
7#define _ASM_x86_64_TSC_H
8
9#include <asm/processor.h>
10
11/*
12 * Standard way to access the cycle counter.
13 */
14typedef unsigned long long cycles_t;
15
16extern unsigned int cpu_khz;
17extern unsigned int tsc_khz;
18
19static inline cycles_t get_cycles(void)
20{
21 unsigned long long ret = 0;
22
23#ifndef CONFIG_X86_TSC
24 if (!cpu_has_tsc)
25 return 0;
26#endif
27
28#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
29 rdtscll(ret);
30#endif
31 return ret;
32}
33
34/* Like get_cycles, but make sure the CPU is synchronized. */
35static __always_inline cycles_t get_cycles_sync(void)
36{
37 unsigned long long ret;
38#ifdef X86_FEATURE_SYNC_RDTSC
39 unsigned eax;
40
41 /*
42 * Don't do an additional sync on CPUs where we know
43 * RDTSC is already synchronous:
44 */
45 alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
46 "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
47#else
48 sync_core();
49#endif
50 rdtscll(ret);
51
52 return ret;
53}
54
55extern void tsc_init(void);
56extern void mark_tsc_unstable(void);
57extern int unsynchronized_tsc(void);
58
59/*
60 * Boot-time check whether the TSCs are synchronized across
61 * all CPUs/cores:
62 */
63extern void check_tsc_sync_source(int cpu);
64extern void check_tsc_sync_target(void);
65
66#endif
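
get_cycles_sync() is meant for measurements where out-of-order execution must not skew the read; a small usage sketch (the measured function is a placeholder):

#include <asm/tsc.h>

static unsigned long long example_measure_cycles(void (*fn)(void))
{
	cycles_t t1, t2;

	t1 = get_cycles_sync();	/* serialize, then read the TSC */
	fn();
	t2 = get_cycles_sync();

	return t2 - t1;
}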
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 0c7847165eae..82b4afe65c91 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -16,46 +16,27 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19#define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16)))
20#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) 19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
21#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) 20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
22#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16)))
23#define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
24#define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16)))
25#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16)))
26 21
27#define VXTIME_TSC 1 22/* Definitions for CONFIG_GENERIC_TIME definitions */
28#define VXTIME_HPET 2 23#define __section_vsyscall_gtod_data __attribute__ \
29#define VXTIME_PMTMR 3 24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
30 26
31#define VGETCPU_RDTSCP 1 27#define VGETCPU_RDTSCP 1
32#define VGETCPU_LSL 2 28#define VGETCPU_LSL 2
33 29
34struct vxtime_data {
35 long hpet_address; /* HPET base address */
36 int last;
37 unsigned long last_tsc;
38 long quot;
39 long tsc_quot;
40 int mode;
41};
42
43#define hpet_readl(a) readl((const void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) 30#define hpet_readl(a) readl((const void __iomem *)fix_to_virt(FIX_HPET_BASE) + a)
44#define hpet_writel(d,a) writel(d, (void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) 31#define hpet_writel(d,a) writel(d, (void __iomem *)fix_to_virt(FIX_HPET_BASE) + a)
45 32
46/* vsyscall space (readonly) */
47extern struct vxtime_data __vxtime;
48extern int __vgetcpu_mode; 33extern int __vgetcpu_mode;
49extern struct timespec __xtime;
50extern volatile unsigned long __jiffies; 34extern volatile unsigned long __jiffies;
51extern struct timezone __sys_tz;
52extern seqlock_t __xtime_lock;
53 35
54/* kernel space (writeable) */ 36/* kernel space (writeable) */
55extern struct vxtime_data vxtime;
56extern int vgetcpu_mode; 37extern int vgetcpu_mode;
57extern struct timezone sys_tz; 38extern struct timezone sys_tz;
58extern int sysctl_vsyscall; 39extern struct vsyscall_gtod_data_t vsyscall_gtod_data;
59 40
60#endif /* __KERNEL__ */ 41#endif /* __KERNEL__ */
61 42
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
new file mode 100644
index 000000000000..1d0ef1ae8036
--- /dev/null
+++ b/include/linux/acpi_pmtmr.h
@@ -0,0 +1,38 @@
1#ifndef _ACPI_PMTMR_H_
2#define _ACPI_PMTMR_H_
3
4#include <linux/clocksource.h>
5
6/* Number of PMTMR ticks expected during calibration run */
7#define PMTMR_TICKS_PER_SEC 3579545
8
9/* limit it to 24 bits */
10#define ACPI_PM_MASK CLOCKSOURCE_MASK(24)
11
12/* Overrun value */
13#define ACPI_PM_OVRRUN (1<<24)
14
15#ifdef CONFIG_X86_PM_TIMER
16
17extern u32 acpi_pm_read_verified(void);
18extern u32 pmtmr_ioport;
19
20static inline u32 acpi_pm_read_early(void)
21{
22 if (!pmtmr_ioport)
23 return 0;
24 /* mask the output to 24 bits */
25 return acpi_pm_read_verified() & ACPI_PM_MASK;
26}
27
28#else
29
30static inline u32 acpi_pm_read_early(void)
31{
32 return 0;
33}
34
35#endif
36
37#endif
38
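
Because acpi_pm_read_early() masks the reading to 24 bits, callers have to compute elapsed ticks modulo 2^24. A minimal sketch of such a delta, assuming the header above (the helper name is invented):

#include <linux/acpi_pmtmr.h>

/* Elapsed PM-timer ticks between two masked readings, wrap-safe. */
static u32 example_pmtmr_delta(u32 start, u32 end)
{
	return (u32)((end - start) & ACPI_PM_MASK);
}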
diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h
index a5c8bb5d80ba..abc521cfb084 100644
--- a/include/linux/agp_backend.h
+++ b/include/linux/agp_backend.h
@@ -87,10 +87,15 @@ struct agp_memory {
87 u32 physical; 87 u32 physical;
88 u8 is_bound; 88 u8 is_bound;
89 u8 is_flushed; 89 u8 is_flushed;
90 u8 vmalloc_flag;
90}; 91};
91 92
92#define AGP_NORMAL_MEMORY 0 93#define AGP_NORMAL_MEMORY 0
93 94
95#define AGP_USER_TYPES (1 << 16)
96#define AGP_USER_MEMORY (AGP_USER_TYPES)
97#define AGP_USER_CACHED_MEMORY (AGP_USER_TYPES + 1)
98
94extern struct agp_bridge_data *agp_bridge; 99extern struct agp_bridge_data *agp_bridge;
95extern struct list_head agp_bridges; 100extern struct list_head agp_bridges;
96 101
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
new file mode 100644
index 000000000000..4ea7e7bcfafe
--- /dev/null
+++ b/include/linux/clockchips.h
@@ -0,0 +1,142 @@
1/* linux/include/linux/clockchips.h
2 *
3 * This file contains the structure definitions for clockchips.
4 *
5 * If you are not a clockchip, or the time of day code, you should
6 * not be including this file!
7 */
8#ifndef _LINUX_CLOCKCHIPS_H
9#define _LINUX_CLOCKCHIPS_H
10
11#ifdef CONFIG_GENERIC_CLOCKEVENTS
12
13#include <linux/clocksource.h>
14#include <linux/cpumask.h>
15#include <linux/ktime.h>
16#include <linux/notifier.h>
17
18struct clock_event_device;
19
20/* Clock event mode commands */
21enum clock_event_mode {
22 CLOCK_EVT_MODE_UNUSED = 0,
23 CLOCK_EVT_MODE_SHUTDOWN,
24 CLOCK_EVT_MODE_PERIODIC,
25 CLOCK_EVT_MODE_ONESHOT,
26};
27
28/* Clock event notification values */
29enum clock_event_nofitiers {
30 CLOCK_EVT_NOTIFY_ADD,
31 CLOCK_EVT_NOTIFY_BROADCAST_ON,
32 CLOCK_EVT_NOTIFY_BROADCAST_OFF,
33 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
34 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
35 CLOCK_EVT_NOTIFY_SUSPEND,
36 CLOCK_EVT_NOTIFY_RESUME,
37 CLOCK_EVT_NOTIFY_CPU_DEAD,
38};
39
40/*
41 * Clock event features
42 */
43#define CLOCK_EVT_FEAT_PERIODIC 0x000001
44#define CLOCK_EVT_FEAT_ONESHOT 0x000002
45/*
46 * x86(64) specific misfeatures:
47 *
48 * - Clockevent source stops in C3 State and needs broadcast support.
49 * - Local APIC timer is used as a dummy device.
50 */
51#define CLOCK_EVT_FEAT_C3STOP 0x000004
52#define CLOCK_EVT_FEAT_DUMMY 0x000008
53
54/**
55 * struct clock_event_device - clock event device descriptor
56 * @name: ptr to clock event name
57 * @features: features of the event device (CLOCK_EVT_FEAT_*)
58 * @max_delta_ns: maximum delta value in ns
59 * @min_delta_ns: minimum delta value in ns
60 * @mult: nanosecond to cycles multiplier
61 * @shift: nanoseconds to cycles divisor (power of two)
62 * @rating: variable to rate clock event devices
63 * @irq: irq number (only for non cpu local devices)
64 * @cpumask: cpumask to indicate for which cpus this device works
65 * @set_next_event: set next event
66 * @set_mode: set mode function
67 * @event_handler: Assigned by the framework to be called by the low
68 * level handler of the event source
69 * @broadcast: function to broadcast events
70 * @list: list head for the management code
71 * @mode: operating mode assigned by the management code
72 * @next_event: local storage for the next event in oneshot mode
73 */
74struct clock_event_device {
75 const char *name;
76 unsigned int features;
77 unsigned long max_delta_ns;
78 unsigned long min_delta_ns;
79 unsigned long mult;
80 int shift;
81 int rating;
82 int irq;
83 cpumask_t cpumask;
84 int (*set_next_event)(unsigned long evt,
85 struct clock_event_device *);
86 void (*set_mode)(enum clock_event_mode mode,
87 struct clock_event_device *);
88 void (*event_handler)(struct clock_event_device *);
89 void (*broadcast)(cpumask_t mask);
90 struct list_head list;
91 enum clock_event_mode mode;
92 ktime_t next_event;
93};
94
95/*
96 * Calculate a multiplication factor for scaled math, which is used to convert
97 * nanoseconds based values to clock ticks:
98 *
99 * clock_ticks = (nanoseconds * factor) >> shift.
100 *
101 * div_sc is the rearranged equation to calculate a factor from a given clock
102 * ticks / nanoseconds ratio:
103 *
104 * factor = (clock_ticks << shift) / nanoseconds
105 */
106static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec,
107 int shift)
108{
109 uint64_t tmp = ((uint64_t)ticks) << shift;
110
111 do_div(tmp, nsec);
112 return (unsigned long) tmp;
113}
114
115/* Clock event layer functions */
116extern unsigned long clockevent_delta2ns(unsigned long latch,
117 struct clock_event_device *evt);
118extern void clockevents_register_device(struct clock_event_device *dev);
119
120extern void clockevents_exchange_device(struct clock_event_device *old,
121 struct clock_event_device *new);
122extern
123struct clock_event_device *clockevents_request_device(unsigned int features,
124 cpumask_t cpumask);
125extern void clockevents_release_device(struct clock_event_device *dev);
126extern void clockevents_set_mode(struct clock_event_device *dev,
127 enum clock_event_mode mode);
128extern int clockevents_register_notifier(struct notifier_block *nb);
129extern void clockevents_unregister_notifier(struct notifier_block *nb);
130extern int clockevents_program_event(struct clock_event_device *dev,
131 ktime_t expires, ktime_t now);
132
133extern void clockevents_notify(unsigned long reason, void *arg);
134
135#else
136
137static inline void clockevents_resume_events(void) { }
138#define clockevents_notify(reason, arg) do { } while (0)
139
140#endif
141
142#endif
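
To show how the structure and helpers above fit together, here is a hypothetical driver sketch that computes mult with div_sc(), derives the delta limits with clockevent_delta2ns() and registers the device; the 32768 Hz rate, the latch limit and the hardware callbacks are made up for illustration:

#include <linux/clockchips.h>

static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt)
{
	/* program the hardware comparator 'delta' ticks into the future */
	return 0;
}

static void example_set_mode(enum clock_event_mode mode,
			     struct clock_event_device *evt)
{
	/* switch the hardware between periodic, oneshot and shutdown */
}

static struct clock_event_device example_clockevent = {
	.name		= "example",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.shift		= 32,
	.rating		= 200,
	.set_next_event	= example_set_next_event,
	.set_mode	= example_set_mode,
};

static void example_clockevent_init(void)
{
	/* factor to convert nanoseconds to 32768 Hz ticks */
	example_clockevent.mult = div_sc(32768, NSEC_PER_SEC, 32);
	example_clockevent.max_delta_ns =
		clockevent_delta2ns(0xffff, &example_clockevent);
	example_clockevent.min_delta_ns =
		clockevent_delta2ns(1, &example_clockevent);
	example_clockevent.cpumask = cpumask_of_cpu(0);
	clockevents_register_device(&example_clockevent);
}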
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 1622d23a8dc3..daa4940cc0f1 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -12,11 +12,13 @@
12#include <linux/timex.h> 12#include <linux/timex.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/list.h> 14#include <linux/list.h>
15#include <linux/timer.h>
15#include <asm/div64.h> 16#include <asm/div64.h>
16#include <asm/io.h> 17#include <asm/io.h>
17 18
18/* clocksource cycle base type */ 19/* clocksource cycle base type */
19typedef u64 cycle_t; 20typedef u64 cycle_t;
21struct clocksource;
20 22
21/** 23/**
22 * struct clocksource - hardware abstraction for a free running counter 24 * struct clocksource - hardware abstraction for a free running counter
@@ -44,8 +46,8 @@ typedef u64 cycle_t;
44 * subtraction of non 64 bit counters 46 * subtraction of non 64 bit counters
45 * @mult: cycle to nanosecond multiplier 47 * @mult: cycle to nanosecond multiplier
46 * @shift: cycle to nanosecond divisor (power of two) 48 * @shift: cycle to nanosecond divisor (power of two)
47 * @update_callback: called when safe to alter clocksource values 49 * @flags: flags describing special properties
48 * @is_continuous: defines if clocksource is free-running. 50 * @vread: vsyscall based read
49 * @cycle_interval: Used internally by timekeeping core, please ignore. 51 * @cycle_interval: Used internally by timekeeping core, please ignore.
50 * @xtime_interval: Used internally by timekeeping core, please ignore. 52 * @xtime_interval: Used internally by timekeeping core, please ignore.
51 */ 53 */
@@ -57,15 +59,30 @@ struct clocksource {
57 cycle_t mask; 59 cycle_t mask;
58 u32 mult; 60 u32 mult;
59 u32 shift; 61 u32 shift;
60 int (*update_callback)(void); 62 unsigned long flags;
61 int is_continuous; 63 cycle_t (*vread)(void);
62 64
63 /* timekeeping specific data, ignore */ 65 /* timekeeping specific data, ignore */
64 cycle_t cycle_last, cycle_interval; 66 cycle_t cycle_last, cycle_interval;
65 u64 xtime_nsec, xtime_interval; 67 u64 xtime_nsec, xtime_interval;
66 s64 error; 68 s64 error;
69
70#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
71 /* Watchdog related data, used by the framework */
72 struct list_head wd_list;
73 cycle_t wd_last;
74#endif
67}; 75};
68 76
77/*
78 * Clock source flags bits:
79 */
80#define CLOCK_SOURCE_IS_CONTINUOUS 0x01
81#define CLOCK_SOURCE_MUST_VERIFY 0x02
82
83#define CLOCK_SOURCE_WATCHDOG 0x10
84#define CLOCK_SOURCE_VALID_FOR_HRES 0x20
85
69/* simplify initialization of mask field */ 86/* simplify initialization of mask field */
70#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1) 87#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1)
71 88
@@ -178,8 +195,16 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
178 195
179 196
180/* used to install a new clocksource */ 197/* used to install a new clocksource */
181int clocksource_register(struct clocksource*); 198extern int clocksource_register(struct clocksource*);
182void clocksource_reselect(void); 199extern struct clocksource* clocksource_get_next(void);
183struct clocksource* clocksource_get_next(void); 200extern void clocksource_change_rating(struct clocksource *cs, int rating);
201
202#ifdef CONFIG_GENERIC_TIME_VSYSCALL
203extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
204#else
205static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
206{
207}
208#endif
184 209
185#endif /* _LINUX_CLOCKSOURCE_H */ 210#endif /* _LINUX_CLOCKSOURCE_H */
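
With update_callback/is_continuous folded into the flags word and vread, a clocksource registration now looks roughly like this hypothetical example (the read callback, the 1 MHz rate and the shift value are invented):

#include <linux/clocksource.h>

static cycle_t example_read(void)
{
	/* read the free-running hardware counter here */
	return 0;
}

static struct clocksource clocksource_example = {
	.name	= "example",
	.rating	= 250,
	.read	= example_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.shift	= 20,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int example_clocksource_init(void)
{
	/* convert the 1000 kHz counter rate into a cycle->ns multiplier */
	clocksource_example.mult =
		clocksource_khz2mult(1000, clocksource_example.shift);
	return clocksource_register(&clocksource_example);
}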
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 7f008f6bfdc3..0899e2cdcdd1 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -84,9 +84,6 @@ struct cpufreq_policy {
84 unsigned int policy; /* see above */ 84 unsigned int policy; /* see above */
85 struct cpufreq_governor *governor; /* see below */ 85 struct cpufreq_governor *governor; /* see below */
86 86
87 struct mutex lock; /* CPU ->setpolicy or ->target may
88 only be called once a time */
89
90 struct work_struct update; /* if update_policy() needs to be 87 struct work_struct update; /* if update_policy() needs to be
91 * called, but you're in IRQ context */ 88 * called, but you're in IRQ context */
92 89
@@ -172,11 +169,16 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
172 unsigned int relation); 169 unsigned int relation);
173 170
174 171
175extern int cpufreq_driver_getavg(struct cpufreq_policy *policy); 172extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy);
176 173
177int cpufreq_register_governor(struct cpufreq_governor *governor); 174int cpufreq_register_governor(struct cpufreq_governor *governor);
178void cpufreq_unregister_governor(struct cpufreq_governor *governor); 175void cpufreq_unregister_governor(struct cpufreq_governor *governor);
179 176
177int lock_policy_rwsem_read(int cpu);
178int lock_policy_rwsem_write(int cpu);
179void unlock_policy_rwsem_read(int cpu);
180void unlock_policy_rwsem_write(int cpu);
181
180 182
181/********************************************************************* 183/*********************************************************************
182 * CPUFREQ DRIVER INTERFACE * 184 * CPUFREQ DRIVER INTERFACE *
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 612472aaa79c..7803014f3a11 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -106,7 +106,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
106 * always balanced, so the interrupted value of ->hardirq_context 106 * always balanced, so the interrupted value of ->hardirq_context
107 * will always be restored. 107 * will always be restored.
108 */ 108 */
109#define irq_enter() \ 109#define __irq_enter() \
110 do { \ 110 do { \
111 account_system_vtime(current); \ 111 account_system_vtime(current); \
112 add_preempt_count(HARDIRQ_OFFSET); \ 112 add_preempt_count(HARDIRQ_OFFSET); \
@@ -114,6 +114,11 @@ static inline void account_system_vtime(struct task_struct *tsk)
114 } while (0) 114 } while (0)
115 115
116/* 116/*
117 * Enter irq context (on NO_HZ, update jiffies):
118 */
119extern void irq_enter(void);
120
121/*
117 * Exit irq context without processing softirqs: 122 * Exit irq context without processing softirqs:
118 */ 123 */
119#define __irq_exit() \ 124#define __irq_exit() \
@@ -128,7 +133,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
128 */ 133 */
129extern void irq_exit(void); 134extern void irq_exit(void);
130 135
131#define nmi_enter() do { lockdep_off(); irq_enter(); } while (0) 136#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
132#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) 137#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0)
133 138
134#endif /* LINUX_HARDIRQ_H */ 139#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fca93025ab51..37f9279192a9 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -21,22 +21,72 @@
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/wait.h> 22#include <linux/wait.h>
23 23
24struct hrtimer_clock_base;
25struct hrtimer_cpu_base;
26
24/* 27/*
25 * Mode arguments of xxx_hrtimer functions: 28 * Mode arguments of xxx_hrtimer functions:
26 */ 29 */
27enum hrtimer_mode { 30enum hrtimer_mode {
28 HRTIMER_ABS, /* Time value is absolute */ 31 HRTIMER_MODE_ABS, /* Time value is absolute */
29 HRTIMER_REL, /* Time value is relative to now */ 32 HRTIMER_MODE_REL, /* Time value is relative to now */
30}; 33};
31 34
35/*
36 * Return values for the callback function
37 */
32enum hrtimer_restart { 38enum hrtimer_restart {
33 HRTIMER_NORESTART, 39 HRTIMER_NORESTART, /* Timer is not restarted */
34 HRTIMER_RESTART, 40 HRTIMER_RESTART, /* Timer must be restarted */
35}; 41};
36 42
37#define HRTIMER_INACTIVE ((void *)1UL) 43/*
44 * hrtimer callback modes:
45 *
46 * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context
47 * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context
48 * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and
49 * does not restart the timer
50 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context
51 * Special mode for tick emulation
52 */
53enum hrtimer_cb_mode {
54 HRTIMER_CB_SOFTIRQ,
55 HRTIMER_CB_IRQSAFE,
56 HRTIMER_CB_IRQSAFE_NO_RESTART,
57 HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
58};
38 59
39struct hrtimer_base; 60/*
61 * Values to track state of the timer
62 *
63 * Possible states:
64 *
65 * 0x00 inactive
66 * 0x01 enqueued into rbtree
67 * 0x02 callback function running
68 * 0x04 callback pending (high resolution mode)
69 *
70 * Special case:
71 * 0x03 callback function running and enqueued
72 * (was requeued on another CPU)
73 * The "callback function running and enqueued" status is only possible on
74 * SMP. It happens for example when a posix timer expired and the callback
75 * queued a signal. Between dropping the lock which protects the posix timer
76 * and reacquiring the base lock of the hrtimer, another CPU can deliver the
77 * signal and rearm the timer. We have to preserve the callback running state,
78 * as otherwise the timer could be removed before the softirq code finishes
79 * the handling of the timer.
80 *
81 * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state to
82 * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario.
83 *
84 * All state transitions are protected by cpu_base->lock.
85 */
86#define HRTIMER_STATE_INACTIVE 0x00
87#define HRTIMER_STATE_ENQUEUED 0x01
88#define HRTIMER_STATE_CALLBACK 0x02
89#define HRTIMER_STATE_PENDING 0x04
40 90
41/** 91/**
42 * struct hrtimer - the basic hrtimer structure 92 * struct hrtimer - the basic hrtimer structure
@@ -46,14 +96,34 @@ struct hrtimer_base;
46 * which the timer is based. 96 * which the timer is based.
47 * @function: timer expiry callback function 97 * @function: timer expiry callback function
48 * @base: pointer to the timer base (per cpu and per clock) 98 * @base: pointer to the timer base (per cpu and per clock)
99 * @state: state information (See bit values above)
100 * @cb_mode: high resolution timer feature to select the callback execution
101 * mode
102 * @cb_entry: list head to enqueue an expired timer into the callback list
103 * @start_site: timer statistics field to store the site where the timer
104 * was started
105 * @start_comm: timer statistics field to store the name of the process which
106 * started the timer
107 * @start_pid: timer statistics field to store the pid of the task which
108 * started the timer
49 * 109 *
50 * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE() 110 * The hrtimer structure must be initialized by hrtimer_init()
51 */ 111 */
52struct hrtimer { 112struct hrtimer {
53 struct rb_node node; 113 struct rb_node node;
54 ktime_t expires; 114 ktime_t expires;
55 int (*function)(struct hrtimer *); 115 enum hrtimer_restart (*function)(struct hrtimer *);
56 struct hrtimer_base *base; 116 struct hrtimer_clock_base *base;
117 unsigned long state;
118#ifdef CONFIG_HIGH_RES_TIMERS
119 enum hrtimer_cb_mode cb_mode;
120 struct list_head cb_entry;
121#endif
122#ifdef CONFIG_TIMER_STATS
123 void *start_site;
124 char start_comm[16];
125 int start_pid;
126#endif
57}; 127};
58 128
59/** 129/**
@@ -70,37 +140,114 @@ struct hrtimer_sleeper {
70 140
71/** 141/**
72 * struct hrtimer_base - the timer base for a specific clock 142 * struct hrtimer_base - the timer base for a specific clock
73 * @index: clock type index for per_cpu support when moving a timer 143 * @index: clock type index for per_cpu support when moving a
74 * to a base on another cpu. 144 * timer to a base on another cpu.
75 * @lock: lock protecting the base and associated timers
76 * @active: red black tree root node for the active timers 145 * @active: red black tree root node for the active timers
77 * @first: pointer to the timer node which expires first 146 * @first: pointer to the timer node which expires first
78 * @resolution: the resolution of the clock, in nanoseconds 147 * @resolution: the resolution of the clock, in nanoseconds
79 * @get_time: function to retrieve the current time of the clock 148 * @get_time: function to retrieve the current time of the clock
80 * @get_softirq_time: function to retrieve the current time from the softirq 149 * @get_softirq_time: function to retrieve the current time from the softirq
81 * @curr_timer: the timer which is executing a callback right now
82 * @softirq_time: the time when running the hrtimer queue in the softirq 150 * @softirq_time: the time when running the hrtimer queue in the softirq
83 * @lock_key: the lock_class_key for use with lockdep 151 * @cb_pending: list of timers where the callback is pending
152 * @offset: offset of this clock to the monotonic base
153 * @reprogram: function to reprogram the timer event
84 */ 154 */
85struct hrtimer_base { 155struct hrtimer_clock_base {
156 struct hrtimer_cpu_base *cpu_base;
86 clockid_t index; 157 clockid_t index;
87 spinlock_t lock;
88 struct rb_root active; 158 struct rb_root active;
89 struct rb_node *first; 159 struct rb_node *first;
90 ktime_t resolution; 160 ktime_t resolution;
91 ktime_t (*get_time)(void); 161 ktime_t (*get_time)(void);
92 ktime_t (*get_softirq_time)(void); 162 ktime_t (*get_softirq_time)(void);
93 struct hrtimer *curr_timer;
94 ktime_t softirq_time; 163 ktime_t softirq_time;
95 struct lock_class_key lock_key; 164#ifdef CONFIG_HIGH_RES_TIMERS
165 ktime_t offset;
166 int (*reprogram)(struct hrtimer *t,
167 struct hrtimer_clock_base *b,
168 ktime_t n);
169#endif
170};
171
172#define HRTIMER_MAX_CLOCK_BASES 2
173
174/*
175 * struct hrtimer_cpu_base - the per cpu clock bases
176 * @lock: lock protecting the base and associated clock bases
177 * and timers
178 * @lock_key: the lock_class_key for use with lockdep
179 * @clock_base: array of clock bases for this cpu
180 * @curr_timer: the timer which is executing a callback right now
181 * @expires_next: absolute time of the next event which was scheduled
182 * via clock_set_next_event()
183 * @hres_active: State of high resolution mode
184 * @check_clocks: Indicator; when set, evaluate whether the time source and
185 * clock event devices allow high resolution mode to be
186 * activated.
187 * @cb_pending: Expired timers are moved from the rbtree to this
188 * list in the timer interrupt. The list is processed
189 * in the softirq.
190 * @nr_events: Total number of timer interrupt events
191 */
192struct hrtimer_cpu_base {
193 spinlock_t lock;
194 struct lock_class_key lock_key;
195 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
196#ifdef CONFIG_HIGH_RES_TIMERS
197 ktime_t expires_next;
198 int hres_active;
199 struct list_head cb_pending;
200 unsigned long nr_events;
201#endif
96}; 202};
97 203
204#ifdef CONFIG_HIGH_RES_TIMERS
205struct clock_event_device;
206
207extern void clock_was_set(void);
208extern void hrtimer_interrupt(struct clock_event_device *dev);
209
210/*
211 * In high resolution mode the time reference must be read accurate
212 */
213static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
214{
215 return timer->base->get_time();
216}
217
218/*
219 * The resolution of the clocks. The resolution value is returned in
220 * the clock_getres() system call to give application programmers an
221 * idea of the (in)accuracy of timers. Timer values are rounded up to
222 * this resolution.
223 */
224# define KTIME_HIGH_RES (ktime_t) { .tv64 = 1 }
225# define KTIME_MONOTONIC_RES KTIME_HIGH_RES
226
227#else
228
229# define KTIME_MONOTONIC_RES KTIME_LOW_RES
230
98/* 231/*
99 * clock_was_set() is a NOP for non- high-resolution systems. The 232 * clock_was_set() is a NOP for non- high-resolution systems. The
100 * time-sorted order guarantees that a timer does not expire early and 233 * time-sorted order guarantees that a timer does not expire early and
101 * is expired in the next softirq when the clock was advanced. 234 * is expired in the next softirq when the clock was advanced.
102 */ 235 */
103#define clock_was_set() do { } while (0) 236static inline void clock_was_set(void) { }
237
238/*
239 * In non high resolution mode the time reference is taken from
240 * the base softirq time variable.
241 */
242static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
243{
244 return timer->base->softirq_time;
245}
246
247#endif
248
249extern ktime_t ktime_get(void);
250extern ktime_t ktime_get_real(void);
104 251
105/* Exported timer functions: */ 252/* Exported timer functions: */
106 253
@@ -114,19 +261,33 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
114extern int hrtimer_cancel(struct hrtimer *timer); 261extern int hrtimer_cancel(struct hrtimer *timer);
115extern int hrtimer_try_to_cancel(struct hrtimer *timer); 262extern int hrtimer_try_to_cancel(struct hrtimer *timer);
116 263
117#define hrtimer_restart(timer) hrtimer_start((timer), (timer)->expires, HRTIMER_ABS) 264static inline int hrtimer_restart(struct hrtimer *timer)
265{
266 return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
267}
118 268
119/* Query timers: */ 269/* Query timers: */
120extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); 270extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
121extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); 271extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
122 272
123#ifdef CONFIG_NO_IDLE_HZ
124extern ktime_t hrtimer_get_next_event(void); 273extern ktime_t hrtimer_get_next_event(void);
125#endif
126 274
275/*
276 * A timer is active when it is enqueued into the rbtree or the callback
277 * function is running.
278 */
127static inline int hrtimer_active(const struct hrtimer *timer) 279static inline int hrtimer_active(const struct hrtimer *timer)
128{ 280{
129 return rb_parent(&timer->node) != &timer->node; 281 return timer->state != HRTIMER_STATE_INACTIVE;
282}
283
284/*
285 * Helper function to check whether the timer is on one of the queues
286 */
287static inline int hrtimer_is_queued(struct hrtimer *timer)
288{
289 return timer->state &
290 (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING);
130} 291}
131 292
132/* Forward a hrtimer so it expires after now: */ 293/* Forward a hrtimer so it expires after now: */
@@ -149,4 +310,53 @@ extern void hrtimer_run_queues(void);
149/* Bootup initialization: */ 310/* Bootup initialization: */
150extern void __init hrtimers_init(void); 311extern void __init hrtimers_init(void);
151 312
313#if BITS_PER_LONG < 64
314extern unsigned long ktime_divns(const ktime_t kt, s64 div);
315#else /* BITS_PER_LONG < 64 */
316# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
317#endif
318
319/* Show pending timers: */
320extern void sysrq_timer_list_show(void);
321
322/*
323 * Timer-statistics info:
324 */
325#ifdef CONFIG_TIMER_STATS
326
327extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
328 void *timerf, char * comm);
329
330static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
331{
332 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
333 timer->function, timer->start_comm);
334}
335
336extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer,
337 void *addr);
338
339static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
340{
341 __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0));
342}
343
344static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
345{
346 timer->start_site = NULL;
347}
348#else
349static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
350{
351}
352
353static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
354{
355}
356
357static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
358{
359}
360#endif
361
152#endif 362#endif
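
Under the renamed mode constants and the enum hrtimer_restart return type above, basic usage looks like this sketch (the timer name and the 100 ms interval are invented for the example):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
	/* push the expiry 100 ms further and keep the timer running */
	hrtimer_forward(timer, hrtimer_cb_get_time(timer),
			ktime_set(0, 100 * 1000 * 1000));
	return HRTIMER_RESTART;
}

static void example_timer_setup(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_timer_fn;
	hrtimer_start(&example_timer, ktime_set(0, 100 * 1000 * 1000),
		      HRTIMER_MODE_REL);
}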
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5a8ba0b8ccba..e5ea1411050b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -42,6 +42,8 @@
42 * IRQF_SHARED - allow sharing the irq among several devices 42 * IRQF_SHARED - allow sharing the irq among several devices
43 * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur 43 * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur
44 * IRQF_TIMER - Flag to mark this interrupt as timer interrupt 44 * IRQF_TIMER - Flag to mark this interrupt as timer interrupt
45 * IRQF_PERCPU - Interrupt is per cpu
46 * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing
45 */ 47 */
46#define IRQF_DISABLED 0x00000020 48#define IRQF_DISABLED 0x00000020
47#define IRQF_SAMPLE_RANDOM 0x00000040 49#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -49,6 +51,7 @@
49#define IRQF_PROBE_SHARED 0x00000100 51#define IRQF_PROBE_SHARED 0x00000100
50#define IRQF_TIMER 0x00000200 52#define IRQF_TIMER 0x00000200
51#define IRQF_PERCPU 0x00000400 53#define IRQF_PERCPU 0x00000400
54#define IRQF_NOBALANCING 0x00000800
52 55
53/* 56/*
54 * Migration helpers. Scheduled for removal in 1/2007 57 * Migration helpers. Scheduled for removal in 1/2007
@@ -239,6 +242,9 @@ enum
239 BLOCK_SOFTIRQ, 242 BLOCK_SOFTIRQ,
240 TASKLET_SOFTIRQ, 243 TASKLET_SOFTIRQ,
241 SCHED_SOFTIRQ, 244 SCHED_SOFTIRQ,
245#ifdef CONFIG_HIGH_RES_TIMERS
246 HRTIMER_SOFTIRQ,
247#endif
242}; 248};
243 249
244/* softirq mask and active fields moved to irq_cpustat_t in 250/* softirq mask and active fields moved to irq_cpustat_t in
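
IRQF_NOBALANCING is passed to request_irq() like the other IRQF_* flags; a hypothetical timer-style registration that should stay off the balancer might look like this (irq number, handler body and name are placeholders):

#include <linux/interrupt.h>

static irqreturn_t example_tick_handler(int irq, void *dev_id)
{
	/* per-tick work would go here */
	return IRQ_HANDLED;
}

static int example_request_tick_irq(unsigned int irq)
{
	return request_irq(irq, example_tick_handler,
			   IRQF_DISABLED | IRQF_TIMER | IRQF_NOBALANCING,
			   "example_tick", NULL);
}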
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5504b671357f..1939d42c21d2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -31,7 +31,7 @@ typedef void fastcall (*irq_flow_handler_t)(unsigned int irq,
31/* 31/*
32 * IRQ line status. 32 * IRQ line status.
33 * 33 *
34 * Bits 0-16 are reserved for the IRQF_* bits in linux/interrupt.h 34 * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h
35 * 35 *
36 * IRQ types 36 * IRQ types
37 */ 37 */
@@ -45,28 +45,30 @@ typedef void fastcall (*irq_flow_handler_t)(unsigned int irq,
45#define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ 45#define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */
46 46
47/* Internal flags */ 47/* Internal flags */
48#define IRQ_INPROGRESS 0x00010000 /* IRQ handler active - do not enter! */ 48#define IRQ_INPROGRESS 0x00000100 /* IRQ handler active - do not enter! */
49#define IRQ_DISABLED 0x00020000 /* IRQ disabled - do not enter! */ 49#define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */
50#define IRQ_PENDING 0x00040000 /* IRQ pending - replay on enable */ 50#define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */
51#define IRQ_REPLAY 0x00080000 /* IRQ has been replayed but not acked yet */ 51#define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */
52#define IRQ_AUTODETECT 0x00100000 /* IRQ is being autodetected */ 52#define IRQ_AUTODETECT 0x00001000 /* IRQ is being autodetected */
53#define IRQ_WAITING 0x00200000 /* IRQ not yet seen - for autodetection */ 53#define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */
54#define IRQ_LEVEL 0x00400000 /* IRQ level triggered */ 54#define IRQ_LEVEL 0x00004000 /* IRQ level triggered */
55#define IRQ_MASKED 0x00800000 /* IRQ masked - shouldn't be seen again */ 55#define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */
56#define IRQ_PER_CPU 0x01000000 /* IRQ is per CPU */ 56#define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */
57#define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */
58#define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */
59#define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */
60#define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */
61#define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */
62#define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */
63
57#ifdef CONFIG_IRQ_PER_CPU 64#ifdef CONFIG_IRQ_PER_CPU
58# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) 65# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
66# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
59#else 67#else
60# define CHECK_IRQ_PER_CPU(var) 0 68# define CHECK_IRQ_PER_CPU(var) 0
69# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING
61#endif 70#endif
62 71
63#define IRQ_NOPROBE 0x02000000 /* IRQ is not valid for probing */
64#define IRQ_NOREQUEST 0x04000000 /* IRQ cannot be requested */
65#define IRQ_NOAUTOEN 0x08000000 /* IRQ will not be enabled on request irq */
66#define IRQ_DELAYED_DISABLE 0x10000000 /* IRQ disable (masking) happens delayed. */
67#define IRQ_WAKEUP 0x20000000 /* IRQ triggers system wakeup */
68#define IRQ_MOVE_PENDING 0x40000000 /* need to re-target IRQ destination */
69
70struct proc_dir_entry; 72struct proc_dir_entry;
71struct msi_desc; 73struct msi_desc;
72 74
@@ -127,6 +129,7 @@ struct irq_chip {
127 * 129 *
128 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] 130 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()]
129 * @chip: low level interrupt hardware access 131 * @chip: low level interrupt hardware access
132 * @msi_desc: MSI descriptor
130 * @handler_data: per-IRQ data for the irq_chip methods 133 * @handler_data: per-IRQ data for the irq_chip methods
131 * @chip_data: platform-specific per-chip private data for the chip 134 * @chip_data: platform-specific per-chip private data for the chip
132 * methods, to allow shared chip implementations 135 * methods, to allow shared chip implementations
@@ -235,11 +238,21 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
235 238
236#endif /* CONFIG_GENERIC_PENDING_IRQ */ 239#endif /* CONFIG_GENERIC_PENDING_IRQ */
237 240
241extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
242extern int irq_can_set_affinity(unsigned int irq);
243
238#else /* CONFIG_SMP */ 244#else /* CONFIG_SMP */
239 245
240#define move_native_irq(x) 246#define move_native_irq(x)
241#define move_masked_irq(x) 247#define move_masked_irq(x)
242 248
249static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
250{
251 return -EINVAL;
252}
253
254static inline int irq_can_set_affinity(unsigned int irq) { return 0; }
255
243#endif /* CONFIG_SMP */ 256#endif /* CONFIG_SMP */
244 257
245#ifdef CONFIG_IRQBALANCE 258#ifdef CONFIG_IRQBALANCE
@@ -261,6 +274,11 @@ static inline int select_smp_affinity(unsigned int irq)
261 274
262extern int no_irq_affinity; 275extern int no_irq_affinity;
263 276
277static inline int irq_balancing_disabled(unsigned int irq)
278{
279 return irq_desc[irq].status & IRQ_NO_BALANCING_MASK;
280}
281
264/* Handle irq action chains: */ 282/* Handle irq action chains: */
265extern int handle_IRQ_event(unsigned int irq, struct irqaction *action); 283extern int handle_IRQ_event(unsigned int irq, struct irqaction *action);
266 284
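
The new affinity helpers can be combined with the balancing check above; a small hypothetical sketch that pins a movable interrupt to one CPU:

#include <linux/irq.h>

static void example_pin_irq(unsigned int irq, int cpu)
{
	/* only retarget interrupts that are allowed to move */
	if (irq_can_set_affinity(irq) && !irq_balancing_disabled(irq))
		irq_set_affinity(irq, cpumask_of_cpu(cpu));
}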
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 0ec6e28bccd2..c080f61fb024 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -142,13 +142,13 @@ static inline u64 get_jiffies_64(void)
142 * 142 *
143 * And some not so obvious. 143 * And some not so obvious.
144 * 144 *
145 * Note that we don't want to return MAX_LONG, because 145 * Note that we don't want to return LONG_MAX, because
146 * for various timeout reasons we often end up having 146 * for various timeout reasons we often end up having
147 * to wait "jiffies+1" in order to guarantee that we wait 147 * to wait "jiffies+1" in order to guarantee that we wait
148 * at _least_ "jiffies" - so "jiffies+1" had better still 148 * at _least_ "jiffies" - so "jiffies+1" had better still
149 * be positive. 149 * be positive.
150 */ 150 */
151#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) 151#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)
152 152
153/* 153/*
154 * We want to do realistic conversions of time so we need to use the same 154 * We want to do realistic conversions of time so we need to use the same
@@ -259,207 +259,23 @@ static inline u64 get_jiffies_64(void)
259#endif 259#endif
260 260
261/* 261/*
262 * Convert jiffies to milliseconds and back. 262 * Convert various time units to each other:
263 *
264 * Avoid unnecessary multiplications/divisions in the
265 * two most common HZ cases:
266 */
267static inline unsigned int jiffies_to_msecs(const unsigned long j)
268{
269#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
270 return (MSEC_PER_SEC / HZ) * j;
271#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
272 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
273#else
274 return (j * MSEC_PER_SEC) / HZ;
275#endif
276}
277
278static inline unsigned int jiffies_to_usecs(const unsigned long j)
279{
280#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
281 return (USEC_PER_SEC / HZ) * j;
282#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
283 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
284#else
285 return (j * USEC_PER_SEC) / HZ;
286#endif
287}
288
289static inline unsigned long msecs_to_jiffies(const unsigned int m)
290{
291 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
292 return MAX_JIFFY_OFFSET;
293#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
294 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
295#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
296 return m * (HZ / MSEC_PER_SEC);
297#else
298 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
299#endif
300}
301
302static inline unsigned long usecs_to_jiffies(const unsigned int u)
303{
304 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
305 return MAX_JIFFY_OFFSET;
306#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
307 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
308#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
309 return u * (HZ / USEC_PER_SEC);
310#else
311 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
312#endif
313}
314
315/*
316 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
317 * that a remainder subtract here would not do the right thing as the
318 * resolution values don't fall on second boundaries. I.e. the line:
319 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
320 *
321 * Rather, we just shift the bits off the right.
322 *
323 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
324 * value to a scaled second value.
325 */
326static __inline__ unsigned long
327timespec_to_jiffies(const struct timespec *value)
328{
329 unsigned long sec = value->tv_sec;
330 long nsec = value->tv_nsec + TICK_NSEC - 1;
331
332 if (sec >= MAX_SEC_IN_JIFFIES){
333 sec = MAX_SEC_IN_JIFFIES;
334 nsec = 0;
335 }
336 return (((u64)sec * SEC_CONVERSION) +
337 (((u64)nsec * NSEC_CONVERSION) >>
338 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
339
340}
341
342static __inline__ void
343jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
344{
345 /*
346 * Convert jiffies to nanoseconds and separate with
347 * one divide.
348 */
349 u64 nsec = (u64)jiffies * TICK_NSEC;
350 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
351}
352
353/* Same for "timeval"
354 *
355 * Well, almost. The problem here is that the real system resolution is
356 * in nanoseconds and the value being converted is in micro seconds.
357 * Also for some machines (those that use HZ = 1024, in-particular),
358 * there is a LARGE error in the tick size in microseconds.
359
360 * The solution we use is to do the rounding AFTER we convert the
361 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
362 * Instruction wise, this should cost only an additional add with carry
363 * instruction above the way it was done above.
364 */
365static __inline__ unsigned long
366timeval_to_jiffies(const struct timeval *value)
367{
368 unsigned long sec = value->tv_sec;
369 long usec = value->tv_usec;
370
371 if (sec >= MAX_SEC_IN_JIFFIES){
372 sec = MAX_SEC_IN_JIFFIES;
373 usec = 0;
374 }
375 return (((u64)sec * SEC_CONVERSION) +
376 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
377 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
378}
379
380static __inline__ void
381jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
382{
383 /*
384 * Convert jiffies to nanoseconds and separate with
385 * one divide.
386 */
387 u64 nsec = (u64)jiffies * TICK_NSEC;
388 long tv_usec;
389
390 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
391 tv_usec /= NSEC_PER_USEC;
392 value->tv_usec = tv_usec;
393}
394
395/*
396 * Convert jiffies/jiffies_64 to clock_t and back.
397 */ 263 */
398static inline clock_t jiffies_to_clock_t(long x) 264extern unsigned int jiffies_to_msecs(const unsigned long j);
399{ 265extern unsigned int jiffies_to_usecs(const unsigned long j);
400#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 266extern unsigned long msecs_to_jiffies(const unsigned int m);
401 return x / (HZ / USER_HZ); 267extern unsigned long usecs_to_jiffies(const unsigned int u);
402#else 268extern unsigned long timespec_to_jiffies(const struct timespec *value);
403 u64 tmp = (u64)x * TICK_NSEC; 269extern void jiffies_to_timespec(const unsigned long jiffies,
404 do_div(tmp, (NSEC_PER_SEC / USER_HZ)); 270 struct timespec *value);
405 return (long)tmp; 271extern unsigned long timeval_to_jiffies(const struct timeval *value);
406#endif 272extern void jiffies_to_timeval(const unsigned long jiffies,
407} 273 struct timeval *value);
408 274extern clock_t jiffies_to_clock_t(long x);
409static inline unsigned long clock_t_to_jiffies(unsigned long x) 275extern unsigned long clock_t_to_jiffies(unsigned long x);
410{ 276extern u64 jiffies_64_to_clock_t(u64 x);
411#if (HZ % USER_HZ)==0 277extern u64 nsec_to_clock_t(u64 x);
412 if (x >= ~0UL / (HZ / USER_HZ)) 278
413 return ~0UL; 279#define TIMESTAMP_SIZE 30
414 return x * (HZ / USER_HZ);
415#else
416 u64 jif;
417
418 /* Don't worry about loss of precision here .. */
419 if (x >= ~0UL / HZ * USER_HZ)
420 return ~0UL;
421
422 /* .. but do try to contain it here */
423 jif = x * (u64) HZ;
424 do_div(jif, USER_HZ);
425 return jif;
426#endif
427}
428
429static inline u64 jiffies_64_to_clock_t(u64 x)
430{
431#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
432 do_div(x, HZ / USER_HZ);
433#else
434 /*
435 * There are better ways that don't overflow early,
436 * but even this doesn't overflow in hundreds of years
437 * in 64 bits, so..
438 */
439 x *= TICK_NSEC;
440 do_div(x, (NSEC_PER_SEC / USER_HZ));
441#endif
442 return x;
443}
444
445static inline u64 nsec_to_clock_t(u64 x)
446{
447#if (NSEC_PER_SEC % USER_HZ) == 0
448 do_div(x, (NSEC_PER_SEC / USER_HZ));
449#elif (USER_HZ % 512) == 0
450 x *= USER_HZ/512;
451 do_div(x, (NSEC_PER_SEC / 512));
452#else
453 /*
454 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
455 * overflow after 64.99 years.
456 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
457 */
458 x *= 9;
459 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2))
460 / USER_HZ));
461#endif
462 return x;
463}
464 280
465#endif 281#endif
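
The conversion helpers keep their round-up semantics now that they live out of line; for instance, a millisecond timeout still maps to at least one jiffy:

#include <linux/jiffies.h>

/* Rounding up guarantees the caller waits at least 2 ms. */
static unsigned long example_timeout(void)
{
	return jiffies + msecs_to_jiffies(2);
}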
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 7444a6326231..c68c7ac6b232 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -261,8 +261,7 @@ static inline s64 ktime_to_ns(const ktime_t kt)
261 * idea of the (in)accuracy of timers. Timer values are rounded up to 261 * idea of the (in)accuracy of timers. Timer values are rounded up to
262 * this resolution values. 262 * this resolution values.
263 */ 263 */
264#define KTIME_REALTIME_RES (ktime_t){ .tv64 = TICK_NSEC } 264#define KTIME_LOW_RES (ktime_t){ .tv64 = TICK_NSEC }
265#define KTIME_MONOTONIC_RES (ktime_t){ .tv64 = TICK_NSEC }
266 265
267/* Get the monotonic time in timespec format: */ 266/* Get the monotonic time in timespec format: */
268extern void ktime_get_ts(struct timespec *ts); 267extern void ktime_get_ts(struct timespec *ts);
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index db05182ca0e8..1be5be88debe 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -105,12 +105,11 @@ struct nfs4_ace {
105 uint32_t access_mask; 105 uint32_t access_mask;
106 int whotype; 106 int whotype;
107 uid_t who; 107 uid_t who;
108 struct list_head l_ace;
109}; 108};
110 109
111struct nfs4_acl { 110struct nfs4_acl {
112 uint32_t naces; 111 uint32_t naces;
113 struct list_head ace_head; 112 struct nfs4_ace aces[0];
114}; 113};
115 114
116typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; 115typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h
index 22aff4d01f20..409b6e02f337 100644
--- a/include/linux/nfs4_acl.h
+++ b/include/linux/nfs4_acl.h
@@ -39,9 +39,12 @@
39 39
40#include <linux/posix_acl.h> 40#include <linux/posix_acl.h>
41 41
42struct nfs4_acl *nfs4_acl_new(void); 42/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
43void nfs4_acl_free(struct nfs4_acl *); 43 * fit in a page: */
44int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); 44#define NFS4_ACL_MAX 170
45
46struct nfs4_acl *nfs4_acl_new(int);
47void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
45int nfs4_acl_get_whotype(char *, u32); 48int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p); 49int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, 50int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
diff --git a/include/linux/tick.h b/include/linux/tick.h
new file mode 100644
index 000000000000..9a7252e089b9
--- /dev/null
+++ b/include/linux/tick.h
@@ -0,0 +1,109 @@
1/* linux/include/linux/tick.h
2 *
3 * This file contains the structure definitions for tick related functions
4 *
5 */
6#ifndef _LINUX_TICK_H
7#define _LINUX_TICK_H
8
9#include <linux/clockchips.h>
10
11#ifdef CONFIG_GENERIC_CLOCKEVENTS
12
13enum tick_device_mode {
14 TICKDEV_MODE_PERIODIC,
15 TICKDEV_MODE_ONESHOT,
16};
17
18struct tick_device {
19 struct clock_event_device *evtdev;
20 enum tick_device_mode mode;
21};
22
23enum tick_nohz_mode {
24 NOHZ_MODE_INACTIVE,
25 NOHZ_MODE_LOWRES,
26 NOHZ_MODE_HIGHRES,
27};
28
29/**
30 * struct tick_sched - sched tick emulation and no idle tick control/stats
31 * @sched_timer: hrtimer to schedule the periodic tick in high
32 * resolution mode
33 * @idle_tick: Store the last idle tick expiry time when the tick
34 * timer is modified for idle sleeps. This is necessary
35 * to resume the tick timer operation in the timeline
36 * when the CPU returns from idle
37 * @tick_stopped: Indicator that the idle tick has been stopped
38 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
39 * @idle_calls: Total number of idle calls
40 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
41 * @idle_entrytime: Time when the idle call was entered
42 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
43 */
44struct tick_sched {
45 struct hrtimer sched_timer;
46 unsigned long check_clocks;
47 enum tick_nohz_mode nohz_mode;
48 ktime_t idle_tick;
49 int tick_stopped;
50 unsigned long idle_jiffies;
51 unsigned long idle_calls;
52 unsigned long idle_sleeps;
53 ktime_t idle_entrytime;
54 ktime_t idle_sleeptime;
55 unsigned long last_jiffies;
56 unsigned long next_jiffies;
57 ktime_t idle_expires;
58};
59
60extern void __init tick_init(void);
61extern int tick_is_oneshot_available(void);
62extern struct tick_device *tick_get_device(int cpu);
63
64# ifdef CONFIG_HIGH_RES_TIMERS
65extern int tick_init_highres(void);
66extern int tick_program_event(ktime_t expires, int force);
67extern void tick_setup_sched_timer(void);
68extern void tick_cancel_sched_timer(int cpu);
69# else
70static inline void tick_cancel_sched_timer(int cpu) { }
71# endif /* HIGHRES */
72
73# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
74extern struct tick_device *tick_get_broadcast_device(void);
75extern cpumask_t *tick_get_broadcast_mask(void);
76
77# ifdef CONFIG_TICK_ONESHOT
78extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
79# endif
80
81# endif /* BROADCAST */
82
83# ifdef CONFIG_TICK_ONESHOT
84extern void tick_clock_notify(void);
85extern int tick_check_oneshot_change(int allow_nohz);
86extern struct tick_sched *tick_get_tick_sched(int cpu);
87# else
88static inline void tick_clock_notify(void) { }
89static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
90# endif
91
92#else /* CONFIG_GENERIC_CLOCKEVENTS */
93static inline void tick_init(void) { }
94static inline void tick_cancel_sched_timer(int cpu) { }
95static inline void tick_clock_notify(void) { }
96static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
97#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
98
99# ifdef CONFIG_NO_HZ
100extern void tick_nohz_stop_sched_tick(void);
101extern void tick_nohz_restart_sched_tick(void);
102extern void tick_nohz_update_jiffies(void);
103# else
104static inline void tick_nohz_stop_sched_tick(void) { }
105static inline void tick_nohz_restart_sched_tick(void) { }
106static inline void tick_nohz_update_jiffies(void) { }
107# endif /* !NO_HZ */
108
109#endif
diff --git a/include/linux/time.h b/include/linux/time.h
index eceb1a59b078..8ea8dea713c7 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -92,6 +92,7 @@ extern struct timespec xtime;
92extern struct timespec wall_to_monotonic; 92extern struct timespec wall_to_monotonic;
93extern seqlock_t xtime_lock __attribute__((weak)); 93extern seqlock_t xtime_lock __attribute__((weak));
94 94
95extern unsigned long read_persistent_clock(void);
95void timekeeping_init(void); 96void timekeeping_init(void);
96 97
97static inline unsigned long get_seconds(void) 98static inline unsigned long get_seconds(void)
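
read_persistent_clock() is the new hook the timekeeping code uses to seed the wall clock from battery-backed hardware at boot and resume. A minimal sketch of the fallback an architecture without a usable RTC could provide (the weak-attribute default is an assumption, not part of this hunk):

    /* Sketch: seconds since the epoch from persistent hardware; 0 = none. */
    unsigned long __attribute__((weak)) read_persistent_clock(void)
    {
            return 0;
    }
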
diff --git a/include/linux/timer.h b/include/linux/timer.h
index fb5edaaf0ebd..719113b652dd 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -2,6 +2,7 @@
2#define _LINUX_TIMER_H 2#define _LINUX_TIMER_H
3 3
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/ktime.h>
5#include <linux/spinlock.h> 6#include <linux/spinlock.h>
6#include <linux/stddef.h> 7#include <linux/stddef.h>
7 8
@@ -15,6 +16,11 @@ struct timer_list {
15 unsigned long data; 16 unsigned long data;
16 17
17 struct tvec_t_base_s *base; 18 struct tvec_t_base_s *base;
19#ifdef CONFIG_TIMER_STATS
20 void *start_site;
21 char start_comm[16];
22 int start_pid;
23#endif
18}; 24};
19 25
20extern struct tvec_t_base_s boot_tvec_bases; 26extern struct tvec_t_base_s boot_tvec_bases;
@@ -61,7 +67,65 @@ extern int del_timer(struct timer_list * timer);
61extern int __mod_timer(struct timer_list *timer, unsigned long expires); 67extern int __mod_timer(struct timer_list *timer, unsigned long expires);
62extern int mod_timer(struct timer_list *timer, unsigned long expires); 68extern int mod_timer(struct timer_list *timer, unsigned long expires);
63 69
70/*
71 * Return when the next timer-wheel timeout occurs (in absolute jiffies),
72 * locks the timer base:
73 */
64extern unsigned long next_timer_interrupt(void); 74extern unsigned long next_timer_interrupt(void);
75/*
76 * Return when the next timer-wheel timeout occurs (in absolute jiffies),
77 * locks the timer base and does the comparison against the given
78 * jiffie.
79 */
80extern unsigned long get_next_timer_interrupt(unsigned long now);
81
82/*
83 * Timer-statistics info:
84 */
85#ifdef CONFIG_TIMER_STATS
86
87extern void init_timer_stats(void);
88
89extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
90 void *timerf, char * comm);
91
92static inline void timer_stats_account_timer(struct timer_list *timer)
93{
94 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
95 timer->function, timer->start_comm);
96}
97
98extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
99 void *addr);
100
101static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
102{
103 __timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
104}
105
106static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
107{
108 timer->start_site = NULL;
109}
110#else
111static inline void init_timer_stats(void)
112{
113}
114
115static inline void timer_stats_account_timer(struct timer_list *timer)
116{
117}
118
119static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
120{
121}
122
123static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
124{
125}
126#endif
127
128extern void delayed_work_timer_fn(unsigned long __data);
65 129
66/** 130/**
67 * add_timer - start a timer 131 * add_timer - start a timer
@@ -96,7 +160,7 @@ static inline void add_timer(struct timer_list *timer)
96extern void init_timers(void); 160extern void init_timers(void);
97extern void run_local_timers(void); 161extern void run_local_timers(void);
98struct hrtimer; 162struct hrtimer;
99extern int it_real_fn(struct hrtimer *); 163extern enum hrtimer_restart it_real_fn(struct hrtimer *);
100 164
101unsigned long __round_jiffies(unsigned long j, int cpu); 165unsigned long __round_jiffies(unsigned long j, int cpu);
102unsigned long __round_jiffies_relative(unsigned long j, int cpu); 166unsigned long __round_jiffies_relative(unsigned long j, int cpu);
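
With CONFIG_TIMER_STATS the three new fields record which code path armed a timer: timer_stats_timer_set_start_info() captures the call site when the timer is set, and timer_stats_account_timer() feeds it into the statistics when it fires. The out-of-line helper is not shown in this hunk; its hrtimer counterpart appears later in this patch, and a matching sketch for timer_list would look roughly like:

    /* Sketch of __timer_stats_timer_set_start_info(): remember who armed
     * the timer so its expiries can be attributed in /proc/timer_stats. */
    void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
    {
            if (timer->start_site)
                    return;                 /* already attributed */

            timer->start_site = addr;       /* caller of mod_timer()/add_timer() */
            memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
            timer->start_pid = current->pid;
    }
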
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 9a24e500c311..da929dbbea2a 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -286,6 +286,13 @@ static inline void time_interpolator_update(long delta_nsec)
286 286
287#define TICK_LENGTH_SHIFT 32 287#define TICK_LENGTH_SHIFT 32
288 288
289#ifdef CONFIG_NO_HZ
290#define NTP_INTERVAL_FREQ (2)
291#else
292#define NTP_INTERVAL_FREQ (HZ)
293#endif
294#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
295
289/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ 296/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
290extern u64 current_tick_length(void); 297extern u64 current_tick_length(void);
291 298
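
NTP_INTERVAL_FREQ decouples the NTP accumulation period from HZ: with NO_HZ the timekeeping/NTP code works on half-second intervals instead of per-tick intervals. A quick standalone check of the two interval lengths (plain C; HZ=250 is just an example value):

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000L

    int main(void)
    {
            long hz = 250;                           /* example CONFIG_HZ */
            printf("NO_HZ: %ld ns per NTP interval\n", NSEC_PER_SEC / 2);
            printf("tick:  %ld ns per NTP interval (HZ=%ld)\n",
                   NSEC_PER_SEC / hz, hz);
            return 0;
    }
    /* NO_HZ: 500000000 ns per NTP interval
     * tick:    4000000 ns per NTP interval (HZ=250) */
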
diff --git a/init/main.c b/init/main.c
index 2421e1544127..953500b02ac4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -40,6 +40,7 @@
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/cpuset.h> 41#include <linux/cpuset.h>
42#include <linux/efi.h> 42#include <linux/efi.h>
43#include <linux/tick.h>
43#include <linux/taskstats_kern.h> 44#include <linux/taskstats_kern.h>
44#include <linux/delayacct.h> 45#include <linux/delayacct.h>
45#include <linux/unistd.h> 46#include <linux/unistd.h>
@@ -515,6 +516,7 @@ asmlinkage void __init start_kernel(void)
515 * enable them 516 * enable them
516 */ 517 */
517 lock_kernel(); 518 lock_kernel();
519 tick_init();
518 boot_cpu_init(); 520 boot_cpu_init();
519 page_address_init(); 521 page_address_init();
520 printk(KERN_NOTICE); 522 printk(KERN_NOTICE);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d96..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 init_sigpending(&sig->shared_pending); 858 init_sigpending(&sig->shared_pending);
859 INIT_LIST_HEAD(&sig->posix_timers); 859 INIT_LIST_HEAD(&sig->posix_timers);
860 860
861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
862 sig->it_real_incr.tv64 = 0; 862 sig->it_real_incr.tv64 = 0;
863 sig->real_timer.function = it_real_fn; 863 sig->real_timer.function = it_real_fn;
864 sig->tsk = tsk; 864 sig->tsk = tsk;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 1134
1135 if (sec != MAX_SCHEDULE_TIMEOUT) { 1135 if (sec != MAX_SCHEDULE_TIMEOUT) {
1136 to = &timeout; 1136 to = &timeout;
1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); 1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1138 hrtimer_init_sleeper(to, current); 1138 hrtimer_init_sleeper(to, current);
1139 to->timer.expires = ktime_set(sec, nsec); 1139 to->timer.expires = ktime_set(sec, nsec);
1140 } 1140 }
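
These hunks are part of a tree-wide rename: HRTIMER_ABS and HRTIMER_REL become HRTIMER_MODE_ABS and HRTIMER_MODE_REL. For reference, a minimal caller under the new names (generic sketch; the callback and the 100 microsecond expiry are placeholders):

    static enum hrtimer_restart my_callback(struct hrtimer *timer)
    {
            /* ... one-shot work ... */
            return HRTIMER_NORESTART;
    }

    static struct hrtimer my_timer;

    static void arm_example(void)
    {
            hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            my_timer.function = my_callback;
            /* fire once, 100 microseconds from now */
            hrtimer_start(&my_timer, ktime_set(0, 100 * 1000), HRTIMER_MODE_REL);
    }
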
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fca..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/kernel/hrtimer.c 2 * linux/kernel/hrtimer.c
3 * 3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6 * 7 *
7 * High-resolution kernel timers 8 * High-resolution kernel timers
8 * 9 *
@@ -31,12 +32,17 @@
31 */ 32 */
32 33
33#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
34#include <linux/module.h> 36#include <linux/module.h>
35#include <linux/percpu.h> 37#include <linux/percpu.h>
36#include <linux/hrtimer.h> 38#include <linux/hrtimer.h>
37#include <linux/notifier.h> 39#include <linux/notifier.h>
38#include <linux/syscalls.h> 40#include <linux/syscalls.h>
41#include <linux/kallsyms.h>
39#include <linux/interrupt.h> 42#include <linux/interrupt.h>
43#include <linux/tick.h>
44#include <linux/seq_file.h>
45#include <linux/err.h>
40 46
41#include <asm/uaccess.h> 47#include <asm/uaccess.h>
42 48
@@ -45,7 +51,7 @@
45 * 51 *
46 * returns the time in ktime_t format 52 * returns the time in ktime_t format
47 */ 53 */
48static ktime_t ktime_get(void) 54ktime_t ktime_get(void)
49{ 55{
50 struct timespec now; 56 struct timespec now;
51 57
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 * 65 *
60 * returns the time in ktime_t format 66 * returns the time in ktime_t format
61 */ 67 */
62static ktime_t ktime_get_real(void) 68ktime_t ktime_get_real(void)
63{ 69{
64 struct timespec now; 70 struct timespec now;
65 71
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 * This ensures that we capture erroneous accesses to these clock ids 85 * This ensures that we capture erroneous accesses to these clock ids
80 * rather than moving them into the range of valid clock id's. 86 * rather than moving them into the range of valid clock id's.
81 */ 87 */
82 88DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83#define MAX_HRTIMER_BASES 2
84
85static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
86{ 89{
90
91 .clock_base =
87 { 92 {
88 .index = CLOCK_REALTIME, 93 {
89 .get_time = &ktime_get_real, 94 .index = CLOCK_REALTIME,
90 .resolution = KTIME_REALTIME_RES, 95 .get_time = &ktime_get_real,
91 }, 96 .resolution = KTIME_LOW_RES,
92 { 97 },
93 .index = CLOCK_MONOTONIC, 98 {
94 .get_time = &ktime_get, 99 .index = CLOCK_MONOTONIC,
95 .resolution = KTIME_MONOTONIC_RES, 100 .get_time = &ktime_get,
96 }, 101 .resolution = KTIME_LOW_RES,
102 },
103 }
97}; 104};
98 105
99/** 106/**
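
The flat per-CPU array of struct hrtimer_base is replaced by a single struct hrtimer_cpu_base per CPU, which embeds the two clock bases and carries one shared lock. Pieced together from the accesses in the rest of this patch, the new layout is roughly as follows (field order and the exact set of HIGH_RES_TIMERS members are assumptions; the authoritative definition lives in include/linux/hrtimer.h):

    /* Rough shape of the new per-CPU structures, inferred from this patch. */
    struct hrtimer_clock_base {
            struct hrtimer_cpu_base *cpu_base;      /* back pointer, owns the lock      */
            clockid_t               index;          /* CLOCK_REALTIME / CLOCK_MONOTONIC */
            struct rb_root          active;         /* rbtree of armed timers           */
            struct rb_node          *first;         /* leftmost = earliest expiry       */
            ktime_t                 resolution;
            ktime_t                 (*get_time)(void);
            ktime_t                 (*get_softirq_time)(void);
            ktime_t                 softirq_time;
            ktime_t                 offset;         /* realtime vs. monotonic offset    */
    };

    struct hrtimer_cpu_base {
            spinlock_t                lock;          /* protects both clock bases       */
            struct lock_class_key     lock_key;
            struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
    #ifdef CONFIG_HIGH_RES_TIMERS
            ktime_t                   expires_next;  /* next event programmed in hw     */
            int                       hres_active;
            struct list_head          cb_pending;    /* callbacks deferred to softirq   */
            unsigned long             nr_events;     /* hrtimer_interrupt() count       */
    #endif
    };
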
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 * Get the coarse grained time at the softirq based on xtime and 132 * Get the coarse grained time at the softirq based on xtime and
126 * wall_to_monotonic. 133 * wall_to_monotonic.
127 */ 134 */
128static void hrtimer_get_softirq_time(struct hrtimer_base *base) 135static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
129{ 136{
130 ktime_t xtim, tomono; 137 ktime_t xtim, tomono;
138 struct timespec xts;
131 unsigned long seq; 139 unsigned long seq;
132 140
133 do { 141 do {
134 seq = read_seqbegin(&xtime_lock); 142 seq = read_seqbegin(&xtime_lock);
135 xtim = timespec_to_ktime(xtime); 143#ifdef CONFIG_NO_HZ
136 tomono = timespec_to_ktime(wall_to_monotonic); 144 getnstimeofday(&xts);
137 145#else
146 xts = xtime;
147#endif
138 } while (read_seqretry(&xtime_lock, seq)); 148 } while (read_seqretry(&xtime_lock, seq));
139 149
140 base[CLOCK_REALTIME].softirq_time = xtim; 150 xtim = timespec_to_ktime(xts);
141 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); 151 tomono = timespec_to_ktime(wall_to_monotonic);
152 base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
153 base->clock_base[CLOCK_MONOTONIC].softirq_time =
154 ktime_add(xtim, tomono);
155}
156
157/*
158 * Helper function to check, whether the timer is running the callback
159 * function
160 */
161static inline int hrtimer_callback_running(struct hrtimer *timer)
162{
163 return timer->state & HRTIMER_STATE_CALLBACK;
142} 164}
143 165
144/* 166/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 */ 169 */
148#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
149 171
150#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
151
152/* 172/*
153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
154 * means that all timers which are tied to this base via timer->base are 174 * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 * possible to set timer->base = NULL and drop the lock: the timer remains 181 * possible to set timer->base = NULL and drop the lock: the timer remains
162 * locked. 182 * locked.
163 */ 183 */
164static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, 184static
165 unsigned long *flags) 185struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
186 unsigned long *flags)
166{ 187{
167 struct hrtimer_base *base; 188 struct hrtimer_clock_base *base;
168 189
169 for (;;) { 190 for (;;) {
170 base = timer->base; 191 base = timer->base;
171 if (likely(base != NULL)) { 192 if (likely(base != NULL)) {
172 spin_lock_irqsave(&base->lock, *flags); 193 spin_lock_irqsave(&base->cpu_base->lock, *flags);
173 if (likely(base == timer->base)) 194 if (likely(base == timer->base))
174 return base; 195 return base;
175 /* The timer has migrated to another CPU: */ 196 /* The timer has migrated to another CPU: */
176 spin_unlock_irqrestore(&base->lock, *flags); 197 spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
177 } 198 }
178 cpu_relax(); 199 cpu_relax();
179 } 200 }
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182/* 203/*
183 * Switch the timer base to the current CPU when possible. 204 * Switch the timer base to the current CPU when possible.
184 */ 205 */
185static inline struct hrtimer_base * 206static inline struct hrtimer_clock_base *
186switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) 207switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
187{ 208{
188 struct hrtimer_base *new_base; 209 struct hrtimer_clock_base *new_base;
210 struct hrtimer_cpu_base *new_cpu_base;
189 211
190 new_base = &__get_cpu_var(hrtimer_bases)[base->index]; 212 new_cpu_base = &__get_cpu_var(hrtimer_bases);
213 new_base = &new_cpu_base->clock_base[base->index];
191 214
192 if (base != new_base) { 215 if (base != new_base) {
193 /* 216 /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 * completed. There is no conflict as we hold the lock until 222 * completed. There is no conflict as we hold the lock until
200 * the timer is enqueued. 223 * the timer is enqueued.
201 */ 224 */
202 if (unlikely(base->curr_timer == timer)) 225 if (unlikely(hrtimer_callback_running(timer)))
203 return base; 226 return base;
204 227
205 /* See the comment in lock_timer_base() */ 228 /* See the comment in lock_timer_base() */
206 timer->base = NULL; 229 timer->base = NULL;
207 spin_unlock(&base->lock); 230 spin_unlock(&base->cpu_base->lock);
208 spin_lock(&new_base->lock); 231 spin_lock(&new_base->cpu_base->lock);
209 timer->base = new_base; 232 timer->base = new_base;
210 } 233 }
211 return new_base; 234 return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 236
214#else /* CONFIG_SMP */ 237#else /* CONFIG_SMP */
215 238
216#define set_curr_timer(b, t) do { } while (0) 239static inline struct hrtimer_clock_base *
217
218static inline struct hrtimer_base *
219lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 240lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
220{ 241{
221 struct hrtimer_base *base = timer->base; 242 struct hrtimer_clock_base *base = timer->base;
222 243
223 spin_lock_irqsave(&base->lock, *flags); 244 spin_lock_irqsave(&base->cpu_base->lock, *flags);
224 245
225 return base; 246 return base;
226} 247}
227 248
228#define switch_hrtimer_base(t, b) (b) 249# define switch_hrtimer_base(t, b) (b)
229 250
230#endif /* !CONFIG_SMP */ 251#endif /* !CONFIG_SMP */
231 252
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 277
257 return ktime_add(kt, tmp); 278 return ktime_add(kt, tmp);
258} 279}
259
260#else /* CONFIG_KTIME_SCALAR */
261
262# endif /* !CONFIG_KTIME_SCALAR */ 280# endif /* !CONFIG_KTIME_SCALAR */
263 281
264/* 282/*
265 * Divide a ktime value by a nanosecond value 283 * Divide a ktime value by a nanosecond value
266 */ 284 */
267static unsigned long ktime_divns(const ktime_t kt, s64 div) 285unsigned long ktime_divns(const ktime_t kt, s64 div)
268{ 286{
269 u64 dclc, inc, dns; 287 u64 dclc, inc, dns;
270 int sft = 0; 288 int sft = 0;
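
ktime_divns() loses its static qualifier so the tick/sched-tick code can use it. On 32-bit builds it avoids a full 64-by-64 division by first shifting the divisor down into 32 bits; the standalone rendering below shows the idea (an illustration with stdint types, not the kernel's exact body, which uses do_div()):

    #include <stdio.h>
    #include <stdint.h>

    /* Same idea as the 32-bit ktime_divns(): shift dividend and divisor
     * right until the divisor fits in 32 bits, then divide. */
    static unsigned long div_ns(int64_t kt_ns, int64_t div)
    {
            uint64_t dclc = (uint64_t)kt_ns;
            int sft = 0;

            while (div >> 32) {             /* divisor still wider than 32 bits      */
                    div >>= 1;
                    sft++;
            }
            dclc >>= sft;                   /* keep the quotient (roughly) unchanged */
            dclc /= (uint64_t)div;          /* cheap 64/32 division on 32-bit CPUs   */

            return (unsigned long)dclc;
    }

    int main(void)
    {
            /* 5.5 seconds split into 1 ms slices -> 5500 */
            printf("%lu\n", div_ns(5500000000LL, 1000000LL));
            return 0;
    }
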
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 299
282 return (unsigned long) dclc; 300 return (unsigned long) dclc;
283} 301}
284
285#else /* BITS_PER_LONG < 64 */
286# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
287#endif /* BITS_PER_LONG >= 64 */ 302#endif /* BITS_PER_LONG >= 64 */
288 303
304/* High resolution timer related functions */
305#ifdef CONFIG_HIGH_RES_TIMERS
306
307/*
308 * High resolution timer enabled ?
309 */
310static int hrtimer_hres_enabled __read_mostly = 1;
311
312/*
313 * Enable / Disable high resolution mode
314 */
315static int __init setup_hrtimer_hres(char *str)
316{
317 if (!strcmp(str, "off"))
318 hrtimer_hres_enabled = 0;
319 else if (!strcmp(str, "on"))
320 hrtimer_hres_enabled = 1;
321 else
322 return 0;
323 return 1;
324}
325
326__setup("highres=", setup_hrtimer_hres);
327
328/*
329 * hrtimer_high_res_enabled - query, if the highres mode is enabled
330 */
331static inline int hrtimer_is_hres_enabled(void)
332{
333 return hrtimer_hres_enabled;
334}
335
336/*
337 * Is the high resolution mode active ?
338 */
339static inline int hrtimer_hres_active(void)
340{
341 return __get_cpu_var(hrtimer_bases).hres_active;
342}
343
344/*
345 * Reprogram the event source with checking both queues for the
346 * next event
347 * Called with interrupts disabled and base->lock held
348 */
349static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
350{
351 int i;
352 struct hrtimer_clock_base *base = cpu_base->clock_base;
353 ktime_t expires;
354
355 cpu_base->expires_next.tv64 = KTIME_MAX;
356
357 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
358 struct hrtimer *timer;
359
360 if (!base->first)
361 continue;
362 timer = rb_entry(base->first, struct hrtimer, node);
363 expires = ktime_sub(timer->expires, base->offset);
364 if (expires.tv64 < cpu_base->expires_next.tv64)
365 cpu_base->expires_next = expires;
366 }
367
368 if (cpu_base->expires_next.tv64 != KTIME_MAX)
369 tick_program_event(cpu_base->expires_next, 1);
370}
371
372/*
373 * Shared reprogramming for clock_realtime and clock_monotonic
374 *
375 * When a timer is enqueued and expires earlier than the already enqueued
376 * timers, we have to check, whether it expires earlier than the timer for
377 * which the clock event device was armed.
378 *
379 * Called with interrupts disabled and base->cpu_base.lock held
380 */
381static int hrtimer_reprogram(struct hrtimer *timer,
382 struct hrtimer_clock_base *base)
383{
384 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
385 ktime_t expires = ktime_sub(timer->expires, base->offset);
386 int res;
387
388 /*
389 * When the callback is running, we do not reprogram the clock event
390 * device. The timer callback is either running on a different CPU or
391 * the callback is executed in the hrtimer_interupt context. The
392 * reprogramming is handled either by the softirq, which called the
393 * callback or at the end of the hrtimer_interrupt.
394 */
395 if (hrtimer_callback_running(timer))
396 return 0;
397
398 if (expires.tv64 >= expires_next->tv64)
399 return 0;
400
401 /*
402 * Clockevents returns -ETIME, when the event was in the past.
403 */
404 res = tick_program_event(expires, 0);
405 if (!IS_ERR_VALUE(res))
406 *expires_next = expires;
407 return res;
408}
409
410
411/*
412 * Retrigger next event is called after clock was set
413 *
414 * Called with interrupts disabled via on_each_cpu()
415 */
416static void retrigger_next_event(void *arg)
417{
418 struct hrtimer_cpu_base *base;
419 struct timespec realtime_offset;
420 unsigned long seq;
421
422 if (!hrtimer_hres_active())
423 return;
424
425 do {
426 seq = read_seqbegin(&xtime_lock);
427 set_normalized_timespec(&realtime_offset,
428 -wall_to_monotonic.tv_sec,
429 -wall_to_monotonic.tv_nsec);
430 } while (read_seqretry(&xtime_lock, seq));
431
432 base = &__get_cpu_var(hrtimer_bases);
433
434 /* Adjust CLOCK_REALTIME offset */
435 spin_lock(&base->lock);
436 base->clock_base[CLOCK_REALTIME].offset =
437 timespec_to_ktime(realtime_offset);
438
439 hrtimer_force_reprogram(base);
440 spin_unlock(&base->lock);
441}
442
443/*
444 * Clock realtime was set
445 *
446 * Change the offset of the realtime clock vs. the monotonic
447 * clock.
448 *
449 * We might have to reprogram the high resolution timer interrupt. On
450 * SMP we call the architecture specific code to retrigger _all_ high
451 * resolution timer interrupts. On UP we just disable interrupts and
452 * call the high resolution interrupt code.
453 */
454void clock_was_set(void)
455{
456 /* Retrigger the CPU local events everywhere */
457 on_each_cpu(retrigger_next_event, NULL, 0, 1);
458}
459
460/*
461 * Check, whether the timer is on the callback pending list
462 */
463static inline int hrtimer_cb_pending(const struct hrtimer *timer)
464{
465 return timer->state & HRTIMER_STATE_PENDING;
466}
467
468/*
469 * Remove a timer from the callback pending list
470 */
471static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
472{
473 list_del_init(&timer->cb_entry);
474}
475
476/*
477 * Initialize the high resolution related parts of cpu_base
478 */
479static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
480{
481 base->expires_next.tv64 = KTIME_MAX;
482 base->hres_active = 0;
483 INIT_LIST_HEAD(&base->cb_pending);
484}
485
486/*
487 * Initialize the high resolution related parts of a hrtimer
488 */
489static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
490{
491 INIT_LIST_HEAD(&timer->cb_entry);
492}
493
494/*
495 * When High resolution timers are active, try to reprogram. Note, that in case
496 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
497 * check happens. The timer gets enqueued into the rbtree. The reprogramming
498 * and expiry check is done in the hrtimer_interrupt or in the softirq.
499 */
500static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
501 struct hrtimer_clock_base *base)
502{
503 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
504
505 /* Timer is expired, act upon the callback mode */
506 switch(timer->cb_mode) {
507 case HRTIMER_CB_IRQSAFE_NO_RESTART:
508 /*
509 * We can call the callback from here. No restart
510 * happens, so no danger of recursion
511 */
512 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
513 return 1;
514 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
515 /*
516 * This is solely for the sched tick emulation with
517 * dynamic tick support to ensure that we do not
518 * restart the tick right on the edge and end up with
519 * the tick timer in the softirq ! The calling site
520 * takes care of this.
521 */
522 return 1;
523 case HRTIMER_CB_IRQSAFE:
524 case HRTIMER_CB_SOFTIRQ:
525 /*
526 * Move everything else into the softirq pending list !
527 */
528 list_add_tail(&timer->cb_entry,
529 &base->cpu_base->cb_pending);
530 timer->state = HRTIMER_STATE_PENDING;
531 raise_softirq(HRTIMER_SOFTIRQ);
532 return 1;
533 default:
534 BUG();
535 }
536 }
537 return 0;
538}
539
540/*
541 * Switch to high resolution mode
542 */
543static void hrtimer_switch_to_hres(void)
544{
545 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
546 unsigned long flags;
547
548 if (base->hres_active)
549 return;
550
551 local_irq_save(flags);
552
553 if (tick_init_highres()) {
554 local_irq_restore(flags);
555 return;
556 }
557 base->hres_active = 1;
558 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
559 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
560
561 tick_setup_sched_timer();
562
563 /* "Retrigger" the interrupt to get things going */
564 retrigger_next_event(NULL);
565 local_irq_restore(flags);
566 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
567 smp_processor_id());
568}
569
570#else
571
572static inline int hrtimer_hres_active(void) { return 0; }
573static inline int hrtimer_is_hres_enabled(void) { return 0; }
574static inline void hrtimer_switch_to_hres(void) { }
575static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
576static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
577 struct hrtimer_clock_base *base)
578{
579 return 0;
580}
581static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
582static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
583static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
584static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
585
586#endif /* CONFIG_HIGH_RES_TIMERS */
587
588#ifdef CONFIG_TIMER_STATS
589void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
590{
591 if (timer->start_site)
592 return;
593
594 timer->start_site = addr;
595 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
596 timer->start_pid = current->pid;
597}
598#endif
599
289/* 600/*
290 * Counterpart to lock_timer_base above: 601 * Counterpart to lock_timer_base above:
291 */ 602 */
292static inline 603static inline
293void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 604void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
294{ 605{
295 spin_unlock_irqrestore(&timer->base->lock, *flags); 606 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
296} 607}
297 608
298/** 609/**
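
The setup_hrtimer_hres() block above also wires up a highres= early boot parameter, so high resolution mode can be disabled at runtime even when CONFIG_HIGH_RES_TIMERS is built in; the actual switch-over happens later, from the timer softirq, and is logged per CPU by hrtimer_switch_to_hres(). For example:

    # Append to the kernel command line to stay in low resolution mode:
    highres=off
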
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 * The timer is inserted in expiry order. Insertion into the 653 * The timer is inserted in expiry order. Insertion into the
343 * red black tree is O(log(n)). Must hold the base lock. 654 * red black tree is O(log(n)). Must hold the base lock.
344 */ 655 */
345static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 656static void enqueue_hrtimer(struct hrtimer *timer,
657 struct hrtimer_clock_base *base, int reprogram)
346{ 658{
347 struct rb_node **link = &base->active.rb_node; 659 struct rb_node **link = &base->active.rb_node;
348 struct rb_node *parent = NULL; 660 struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 * Insert the timer to the rbtree and check whether it 680 * Insert the timer to the rbtree and check whether it
369 * replaces the first pending timer 681 * replaces the first pending timer
370 */ 682 */
371 rb_link_node(&timer->node, parent, link);
372 rb_insert_color(&timer->node, &base->active);
373
374 if (!base->first || timer->expires.tv64 < 683 if (!base->first || timer->expires.tv64 <
375 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 684 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
685 /*
686 * Reprogram the clock event device. When the timer is already
687 * expired hrtimer_enqueue_reprogram has either called the
688 * callback or added it to the pending list and raised the
689 * softirq.
690 *
691 * This is a NOP for !HIGHRES
692 */
693 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
694 return;
695
376 base->first = &timer->node; 696 base->first = &timer->node;
697 }
698
699 rb_link_node(&timer->node, parent, link);
700 rb_insert_color(&timer->node, &base->active);
701 /*
702 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
703 * state of a possibly running callback.
704 */
705 timer->state |= HRTIMER_STATE_ENQUEUED;
377} 706}
378 707
379/* 708/*
380 * __remove_hrtimer - internal function to remove a timer 709 * __remove_hrtimer - internal function to remove a timer
381 * 710 *
382 * Caller must hold the base lock. 711 * Caller must hold the base lock.
712 *
713 * High resolution timer mode reprograms the clock event device when the
714 * timer is the one which expires next. The caller can disable this by setting
715 * reprogram to zero. This is useful, when the context does a reprogramming
716 * anyway (e.g. timer interrupt)
383 */ 717 */
384static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 718static void __remove_hrtimer(struct hrtimer *timer,
719 struct hrtimer_clock_base *base,
720 unsigned long newstate, int reprogram)
385{ 721{
386 /* 722 /* High res. callback list. NOP for !HIGHRES */
387 * Remove the timer from the rbtree and replace the 723 if (hrtimer_cb_pending(timer))
388 * first entry pointer if necessary. 724 hrtimer_remove_cb_pending(timer);
389 */ 725 else {
390 if (base->first == &timer->node) 726 /*
391 base->first = rb_next(&timer->node); 727 * Remove the timer from the rbtree and replace the
392 rb_erase(&timer->node, &base->active); 728 * first entry pointer if necessary.
393 rb_set_parent(&timer->node, &timer->node); 729 */
730 if (base->first == &timer->node) {
731 base->first = rb_next(&timer->node);
732 /* Reprogram the clock event device. if enabled */
733 if (reprogram && hrtimer_hres_active())
734 hrtimer_force_reprogram(base->cpu_base);
735 }
736 rb_erase(&timer->node, &base->active);
737 }
738 timer->state = newstate;
394} 739}
395 740
396/* 741/*
397 * remove hrtimer, called with base lock held 742 * remove hrtimer, called with base lock held
398 */ 743 */
399static inline int 744static inline int
400remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 745remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
401{ 746{
402 if (hrtimer_active(timer)) { 747 if (hrtimer_is_queued(timer)) {
403 __remove_hrtimer(timer, base); 748 int reprogram;
749
750 /*
751 * Remove the timer and force reprogramming when high
752 * resolution mode is active and the timer is on the current
753 * CPU. If we remove a timer on another CPU, reprogramming is
754 * skipped. The interrupt event on this CPU is fired and
755 * reprogramming happens in the interrupt handler. This is a
756 * rare case and less expensive than a smp call.
757 */
758 timer_stats_hrtimer_clear_start_info(timer);
759 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
760 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
761 reprogram);
404 return 1; 762 return 1;
405 } 763 }
406 return 0; 764 return 0;
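
enqueue_hrtimer() and __remove_hrtimer() now maintain an explicit per-timer state word instead of the old rb_set_parent() self-link trick, which is what lets hrtimer_is_queued() and hrtimer_callback_running() be cheap bit tests. The bit values below are assumptions (they are defined in include/linux/hrtimer.h, not in this file); the point is that ENQUEUED, CALLBACK and PENDING are independent flags:

    /* Assumed state bits -- see include/linux/hrtimer.h for the real values. */
    #define HRTIMER_STATE_INACTIVE  0x00    /* not armed at all                  */
    #define HRTIMER_STATE_ENQUEUED  0x01    /* sitting in a clock base's rbtree  */
    #define HRTIMER_STATE_CALLBACK  0x02    /* callback currently executing      */
    #define HRTIMER_STATE_PENDING   0x04    /* queued on cb_pending for softirq  */

    static inline int hrtimer_is_queued(struct hrtimer *timer)
    {
            return timer->state & (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING);
    }

    /* "active" means anything but fully inactive, a running callback included. */
    static inline int hrtimer_active(const struct hrtimer *timer)
    {
            return timer->state != HRTIMER_STATE_INACTIVE;
    }
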
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419int 777int
420hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 778hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
421{ 779{
422 struct hrtimer_base *base, *new_base; 780 struct hrtimer_clock_base *base, *new_base;
423 unsigned long flags; 781 unsigned long flags;
424 int ret; 782 int ret;
425 783
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 /* Switch the timer base, if necessary: */ 789 /* Switch the timer base, if necessary: */
432 new_base = switch_hrtimer_base(timer, base); 790 new_base = switch_hrtimer_base(timer, base);
433 791
434 if (mode == HRTIMER_REL) { 792 if (mode == HRTIMER_MODE_REL) {
435 tim = ktime_add(tim, new_base->get_time()); 793 tim = ktime_add(tim, new_base->get_time());
436 /* 794 /*
437 * CONFIG_TIME_LOW_RES is a temporary way for architectures 795 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 } 804 }
447 timer->expires = tim; 805 timer->expires = tim;
448 806
449 enqueue_hrtimer(timer, new_base); 807 timer_stats_hrtimer_set_start_info(timer);
808
809 enqueue_hrtimer(timer, new_base, base == new_base);
450 810
451 unlock_hrtimer_base(timer, &flags); 811 unlock_hrtimer_base(timer, &flags);
452 812
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 */ 826 */
467int hrtimer_try_to_cancel(struct hrtimer *timer) 827int hrtimer_try_to_cancel(struct hrtimer *timer)
468{ 828{
469 struct hrtimer_base *base; 829 struct hrtimer_clock_base *base;
470 unsigned long flags; 830 unsigned long flags;
471 int ret = -1; 831 int ret = -1;
472 832
473 base = lock_hrtimer_base(timer, &flags); 833 base = lock_hrtimer_base(timer, &flags);
474 834
475 if (base->curr_timer != timer) 835 if (!hrtimer_callback_running(timer))
476 ret = remove_hrtimer(timer, base); 836 ret = remove_hrtimer(timer, base);
477 837
478 unlock_hrtimer_base(timer, &flags); 838 unlock_hrtimer_base(timer, &flags);
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 */ 868 */
509ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 869ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
510{ 870{
511 struct hrtimer_base *base; 871 struct hrtimer_clock_base *base;
512 unsigned long flags; 872 unsigned long flags;
513 ktime_t rem; 873 ktime_t rem;
514 874
515 base = lock_hrtimer_base(timer, &flags); 875 base = lock_hrtimer_base(timer, &flags);
516 rem = ktime_sub(timer->expires, timer->base->get_time()); 876 rem = ktime_sub(timer->expires, base->get_time());
517 unlock_hrtimer_base(timer, &flags); 877 unlock_hrtimer_base(timer, &flags);
518 878
519 return rem; 879 return rem;
520} 880}
521EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 881EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
522 882
523#ifdef CONFIG_NO_IDLE_HZ 883#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
524/** 884/**
525 * hrtimer_get_next_event - get the time until next expiry event 885 * hrtimer_get_next_event - get the time until next expiry event
526 * 886 *
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 */ 889 */
530ktime_t hrtimer_get_next_event(void) 890ktime_t hrtimer_get_next_event(void)
531{ 891{
532 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 892 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
893 struct hrtimer_clock_base *base = cpu_base->clock_base;
533 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 894 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
534 unsigned long flags; 895 unsigned long flags;
535 int i; 896 int i;
536 897
537 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 898 spin_lock_irqsave(&cpu_base->lock, flags);
538 struct hrtimer *timer;
539 899
540 spin_lock_irqsave(&base->lock, flags); 900 if (!hrtimer_hres_active()) {
541 if (!base->first) { 901 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
542 spin_unlock_irqrestore(&base->lock, flags); 902 struct hrtimer *timer;
543 continue; 903
904 if (!base->first)
905 continue;
906
907 timer = rb_entry(base->first, struct hrtimer, node);
908 delta.tv64 = timer->expires.tv64;
909 delta = ktime_sub(delta, base->get_time());
910 if (delta.tv64 < mindelta.tv64)
911 mindelta.tv64 = delta.tv64;
544 } 912 }
545 timer = rb_entry(base->first, struct hrtimer, node);
546 delta.tv64 = timer->expires.tv64;
547 spin_unlock_irqrestore(&base->lock, flags);
548 delta = ktime_sub(delta, base->get_time());
549 if (delta.tv64 < mindelta.tv64)
550 mindelta.tv64 = delta.tv64;
551 } 913 }
914
915 spin_unlock_irqrestore(&cpu_base->lock, flags);
916
552 if (mindelta.tv64 < 0) 917 if (mindelta.tv64 < 0)
553 mindelta.tv64 = 0; 918 mindelta.tv64 = 0;
554 return mindelta; 919 return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 929void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
565 enum hrtimer_mode mode) 930 enum hrtimer_mode mode)
566{ 931{
567 struct hrtimer_base *bases; 932 struct hrtimer_cpu_base *cpu_base;
568 933
569 memset(timer, 0, sizeof(struct hrtimer)); 934 memset(timer, 0, sizeof(struct hrtimer));
570 935
571 bases = __raw_get_cpu_var(hrtimer_bases); 936 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
572 937
573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 938 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
574 clock_id = CLOCK_MONOTONIC; 939 clock_id = CLOCK_MONOTONIC;
575 940
576 timer->base = &bases[clock_id]; 941 timer->base = &cpu_base->clock_base[clock_id];
577 rb_set_parent(&timer->node, &timer->node); 942 hrtimer_init_timer_hres(timer);
943
944#ifdef CONFIG_TIMER_STATS
945 timer->start_site = NULL;
946 timer->start_pid = -1;
947 memset(timer->start_comm, 0, TASK_COMM_LEN);
948#endif
578} 949}
579EXPORT_SYMBOL_GPL(hrtimer_init); 950EXPORT_SYMBOL_GPL(hrtimer_init);
580 951
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 */ 959 */
589int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 960int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
590{ 961{
591 struct hrtimer_base *bases; 962 struct hrtimer_cpu_base *cpu_base;
592 963
593 bases = __raw_get_cpu_var(hrtimer_bases); 964 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
594 *tp = ktime_to_timespec(bases[which_clock].resolution); 965 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
595 966
596 return 0; 967 return 0;
597} 968}
598EXPORT_SYMBOL_GPL(hrtimer_get_res); 969EXPORT_SYMBOL_GPL(hrtimer_get_res);
599 970
971#ifdef CONFIG_HIGH_RES_TIMERS
972
973/*
974 * High resolution timer interrupt
975 * Called with interrupts disabled
976 */
977void hrtimer_interrupt(struct clock_event_device *dev)
978{
979 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
980 struct hrtimer_clock_base *base;
981 ktime_t expires_next, now;
982 int i, raise = 0;
983
984 BUG_ON(!cpu_base->hres_active);
985 cpu_base->nr_events++;
986 dev->next_event.tv64 = KTIME_MAX;
987
988 retry:
989 now = ktime_get();
990
991 expires_next.tv64 = KTIME_MAX;
992
993 base = cpu_base->clock_base;
994
995 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
996 ktime_t basenow;
997 struct rb_node *node;
998
999 spin_lock(&cpu_base->lock);
1000
1001 basenow = ktime_add(now, base->offset);
1002
1003 while ((node = base->first)) {
1004 struct hrtimer *timer;
1005
1006 timer = rb_entry(node, struct hrtimer, node);
1007
1008 if (basenow.tv64 < timer->expires.tv64) {
1009 ktime_t expires;
1010
1011 expires = ktime_sub(timer->expires,
1012 base->offset);
1013 if (expires.tv64 < expires_next.tv64)
1014 expires_next = expires;
1015 break;
1016 }
1017
1018 /* Move softirq callbacks to the pending list */
1019 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1020 __remove_hrtimer(timer, base,
1021 HRTIMER_STATE_PENDING, 0);
1022 list_add_tail(&timer->cb_entry,
1023 &base->cpu_base->cb_pending);
1024 raise = 1;
1025 continue;
1026 }
1027
1028 __remove_hrtimer(timer, base,
1029 HRTIMER_STATE_CALLBACK, 0);
1030 timer_stats_account_hrtimer(timer);
1031
1032 /*
1033 * Note: We clear the CALLBACK bit after
1034 * enqueue_hrtimer to avoid reprogramming of
1035 * the event hardware. This happens at the end
1036 * of this function anyway.
1037 */
1038 if (timer->function(timer) != HRTIMER_NORESTART) {
1039 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1040 enqueue_hrtimer(timer, base, 0);
1041 }
1042 timer->state &= ~HRTIMER_STATE_CALLBACK;
1043 }
1044 spin_unlock(&cpu_base->lock);
1045 base++;
1046 }
1047
1048 cpu_base->expires_next = expires_next;
1049
1050 /* Reprogramming necessary ? */
1051 if (expires_next.tv64 != KTIME_MAX) {
1052 if (tick_program_event(expires_next, 0))
1053 goto retry;
1054 }
1055
1056 /* Raise softirq ? */
1057 if (raise)
1058 raise_softirq(HRTIMER_SOFTIRQ);
1059}
1060
1061static void run_hrtimer_softirq(struct softirq_action *h)
1062{
1063 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1064
1065 spin_lock_irq(&cpu_base->lock);
1066
1067 while (!list_empty(&cpu_base->cb_pending)) {
1068 enum hrtimer_restart (*fn)(struct hrtimer *);
1069 struct hrtimer *timer;
1070 int restart;
1071
1072 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry);
1074
1075 timer_stats_account_hrtimer(timer);
1076
1077 fn = timer->function;
1078 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1079 spin_unlock_irq(&cpu_base->lock);
1080
1081 restart = fn(timer);
1082
1083 spin_lock_irq(&cpu_base->lock);
1084
1085 timer->state &= ~HRTIMER_STATE_CALLBACK;
1086 if (restart == HRTIMER_RESTART) {
1087 BUG_ON(hrtimer_active(timer));
1088 /*
1089 * Enqueue the timer, allow reprogramming of the event
1090 * device
1091 */
1092 enqueue_hrtimer(timer, timer->base, 1);
1093 } else if (hrtimer_active(timer)) {
1094 /*
1095 * If the timer was rearmed on another CPU, reprogram
1096 * the event device.
1097 */
1098 if (timer->base->first == &timer->node)
1099 hrtimer_reprogram(timer, timer->base);
1100 }
1101 }
1102 spin_unlock_irq(&cpu_base->lock);
1103}
1104
1105#endif /* CONFIG_HIGH_RES_TIMERS */
1106
600/* 1107/*
601 * Expire the per base hrtimer-queue: 1108 * Expire the per base hrtimer-queue:
602 */ 1109 */
603static inline void run_hrtimer_queue(struct hrtimer_base *base) 1110static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1111 int index)
604{ 1112{
605 struct rb_node *node; 1113 struct rb_node *node;
1114 struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
606 1115
607 if (!base->first) 1116 if (!base->first)
608 return; 1117 return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 if (base->get_softirq_time) 1119 if (base->get_softirq_time)
611 base->softirq_time = base->get_softirq_time(); 1120 base->softirq_time = base->get_softirq_time();
612 1121
613 spin_lock_irq(&base->lock); 1122 spin_lock_irq(&cpu_base->lock);
614 1123
615 while ((node = base->first)) { 1124 while ((node = base->first)) {
616 struct hrtimer *timer; 1125 struct hrtimer *timer;
617 int (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
618 int restart; 1127 int restart;
619 1128
620 timer = rb_entry(node, struct hrtimer, node); 1129 timer = rb_entry(node, struct hrtimer, node);
621 if (base->softirq_time.tv64 <= timer->expires.tv64) 1130 if (base->softirq_time.tv64 <= timer->expires.tv64)
622 break; 1131 break;
623 1132
1133 timer_stats_account_hrtimer(timer);
1134
624 fn = timer->function; 1135 fn = timer->function;
625 set_curr_timer(base, timer); 1136 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
626 __remove_hrtimer(timer, base); 1137 spin_unlock_irq(&cpu_base->lock);
627 spin_unlock_irq(&base->lock);
628 1138
629 restart = fn(timer); 1139 restart = fn(timer);
630 1140
631 spin_lock_irq(&base->lock); 1141 spin_lock_irq(&cpu_base->lock);
632 1142
1143 timer->state &= ~HRTIMER_STATE_CALLBACK;
633 if (restart != HRTIMER_NORESTART) { 1144 if (restart != HRTIMER_NORESTART) {
634 BUG_ON(hrtimer_active(timer)); 1145 BUG_ON(hrtimer_active(timer));
635 enqueue_hrtimer(timer, base); 1146 enqueue_hrtimer(timer, base, 0);
636 } 1147 }
637 } 1148 }
638 set_curr_timer(base, NULL); 1149 spin_unlock_irq(&cpu_base->lock);
639 spin_unlock_irq(&base->lock);
640} 1150}
641 1151
642/* 1152/*
643 * Called from timer softirq every jiffy, expire hrtimers: 1153 * Called from timer softirq every jiffy, expire hrtimers:
1154 *
1155 * For HRT its the fall back code to run the softirq in the timer
1156 * softirq context in case the hrtimer initialization failed or has
1157 * not been done yet.
644 */ 1158 */
645void hrtimer_run_queues(void) 1159void hrtimer_run_queues(void)
646{ 1160{
647 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 1161 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
648 int i; 1162 int i;
649 1163
650 hrtimer_get_softirq_time(base); 1164 if (hrtimer_hres_active())
1165 return;
1166
1167 /*
1168 * This _is_ ugly: We have to check in the softirq context,
1169 * whether we can switch to highres and / or nohz mode. The
1170 * clocksource switch happens in the timer interrupt with
1171 * xtime_lock held. Notification from there only sets the
1172 * check bit in the tick_oneshot code, otherwise we might
1173 * deadlock vs. xtime_lock.
1174 */
1175 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1176 hrtimer_switch_to_hres();
651 1177
652 for (i = 0; i < MAX_HRTIMER_BASES; i++) 1178 hrtimer_get_softirq_time(cpu_base);
653 run_hrtimer_queue(&base[i]); 1179
1180 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1181 run_hrtimer_queue(cpu_base, i);
654} 1182}
655 1183
656/* 1184/*
657 * Sleep related functions: 1185 * Sleep related functions:
658 */ 1186 */
659static int hrtimer_wakeup(struct hrtimer *timer) 1187static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
660{ 1188{
661 struct hrtimer_sleeper *t = 1189 struct hrtimer_sleeper *t =
662 container_of(timer, struct hrtimer_sleeper, timer); 1190 container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673{ 1201{
674 sl->timer.function = hrtimer_wakeup; 1202 sl->timer.function = hrtimer_wakeup;
675 sl->task = task; 1203 sl->task = task;
1204#ifdef CONFIG_HIGH_RES_TIMERS
1205 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
1206#endif
676} 1207}
677 1208
678static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1209static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 set_current_state(TASK_INTERRUPTIBLE); 1214 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode); 1215 hrtimer_start(&t->timer, t->timer.expires, mode);
685 1216
686 schedule(); 1217 if (likely(t->task))
1218 schedule();
687 1219
688 hrtimer_cancel(&t->timer); 1220 hrtimer_cancel(&t->timer);
689 mode = HRTIMER_ABS; 1221 mode = HRTIMER_MODE_ABS;
690 1222
691 } while (t->task && !signal_pending(current)); 1223 } while (t->task && !signal_pending(current));
692 1224
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 1234
703 restart->fn = do_no_restart_syscall; 1235 restart->fn = do_no_restart_syscall;
704 1236
705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); 1237 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1238 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
707 1239
708 if (do_nanosleep(&t, HRTIMER_ABS)) 1240 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
709 return 0; 1241 return 0;
710 1242
711 rmtp = (struct timespec __user *) restart->arg1; 1243 rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 return 0; 1270 return 0;
739 1271
740 /* Absolute timers do not update the rmtp value and restart: */ 1272 /* Absolute timers do not update the rmtp value and restart: */
741 if (mode == HRTIMER_ABS) 1273 if (mode == HRTIMER_MODE_ABS)
742 return -ERESTARTNOHAND; 1274 return -ERESTARTNOHAND;
743 1275
744 if (rmtp) { 1276 if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 if (!timespec_valid(&tu)) 1303 if (!timespec_valid(&tu))
772 return -EINVAL; 1304 return -EINVAL;
773 1305
774 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); 1306 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
775} 1307}
776 1308
777/* 1309/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 */ 1311 */
780static void __devinit init_hrtimers_cpu(int cpu) 1312static void __devinit init_hrtimers_cpu(int cpu)
781{ 1313{
782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 1314 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
783 int i; 1315 int i;
784 1316
785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 1317 spin_lock_init(&cpu_base->lock);
786 spin_lock_init(&base->lock); 1318 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
787 lockdep_set_class(&base->lock, &base->lock_key); 1319
788 } 1320 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1321 cpu_base->clock_base[i].cpu_base = cpu_base;
1322
1323 hrtimer_init_hres(cpu_base);
789} 1324}
790 1325
791#ifdef CONFIG_HOTPLUG_CPU 1326#ifdef CONFIG_HOTPLUG_CPU
792 1327
793static void migrate_hrtimer_list(struct hrtimer_base *old_base, 1328static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
794 struct hrtimer_base *new_base) 1329 struct hrtimer_clock_base *new_base)
795{ 1330{
796 struct hrtimer *timer; 1331 struct hrtimer *timer;
797 struct rb_node *node; 1332 struct rb_node *node;
798 1333
799 while ((node = rb_first(&old_base->active))) { 1334 while ((node = rb_first(&old_base->active))) {
800 timer = rb_entry(node, struct hrtimer, node); 1335 timer = rb_entry(node, struct hrtimer, node);
801 __remove_hrtimer(timer, old_base); 1336 BUG_ON(hrtimer_callback_running(timer));
1337 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
802 timer->base = new_base; 1338 timer->base = new_base;
803 enqueue_hrtimer(timer, new_base); 1339 /*
1340 * Enqueue the timer. Allow reprogramming of the event device
1341 */
1342 enqueue_hrtimer(timer, new_base, 1);
804 } 1343 }
805} 1344}
806 1345
807static void migrate_hrtimers(int cpu) 1346static void migrate_hrtimers(int cpu)
808{ 1347{
809 struct hrtimer_base *old_base, *new_base; 1348 struct hrtimer_cpu_base *old_base, *new_base;
810 int i; 1349 int i;
811 1350
812 BUG_ON(cpu_online(cpu)); 1351 BUG_ON(cpu_online(cpu));
813 old_base = per_cpu(hrtimer_bases, cpu); 1352 old_base = &per_cpu(hrtimer_bases, cpu);
814 new_base = get_cpu_var(hrtimer_bases); 1353 new_base = &get_cpu_var(hrtimer_bases);
815
816 local_irq_disable();
817 1354
818 for (i = 0; i < MAX_HRTIMER_BASES; i++) { 1355 tick_cancel_sched_timer(cpu);
819 1356
820 spin_lock(&new_base->lock); 1357 local_irq_disable();
821 spin_lock(&old_base->lock);
822
823 BUG_ON(old_base->curr_timer);
824 1358
825 migrate_hrtimer_list(old_base, new_base); 1359 spin_lock(&new_base->lock);
1360 spin_lock(&old_base->lock);
826 1361
827 spin_unlock(&old_base->lock); 1362 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
828 spin_unlock(&new_base->lock); 1363 migrate_hrtimer_list(&old_base->clock_base[i],
829 old_base++; 1364 &new_base->clock_base[i]);
830 new_base++;
831 } 1365 }
1366 spin_unlock(&old_base->lock);
1367 spin_unlock(&new_base->lock);
832 1368
833 local_irq_enable(); 1369 local_irq_enable();
834 put_cpu_var(hrtimer_bases); 1370 put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 1384
849#ifdef CONFIG_HOTPLUG_CPU 1385#ifdef CONFIG_HOTPLUG_CPU
850 case CPU_DEAD: 1386 case CPU_DEAD:
1387 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
851 migrate_hrtimers(cpu); 1388 migrate_hrtimers(cpu);
852 break; 1389 break;
853#endif 1390#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1405 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
869 (void *)(long)smp_processor_id()); 1406 (void *)(long)smp_processor_id());
870 register_cpu_notifier(&hrtimers_nb); 1407 register_cpu_notifier(&hrtimers_nb);
1408#ifdef CONFIG_HIGH_RES_TIMERS
1409 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
1410#endif
871} 1411}
872 1412
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcdc..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_data - set irq type data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @data: Pointer to interrupt specific data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the hardware irq controller data for an irq
174 */ 174 */
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
230 */ 230 */
231static void default_disable(unsigned int irq) 231static void default_disable(unsigned int irq)
232{ 232{
233 struct irq_desc *desc = irq_desc + irq;
234
235 if (!(desc->status & IRQ_DELAYED_DISABLE))
236 desc->chip->mask(irq);
237} 233}
238 234
239/* 235/*
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
298 294
299 if (unlikely(desc->status & IRQ_INPROGRESS)) 295 if (unlikely(desc->status & IRQ_INPROGRESS))
300 goto out_unlock; 296 goto out_unlock;
301 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
302 kstat_cpu(cpu).irqs[irq]++; 297 kstat_cpu(cpu).irqs[irq]++;
303 298
304 action = desc->action; 299 action = desc->action;
305 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 300 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
301 if (desc->chip->mask)
302 desc->chip->mask(irq);
303 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
304 desc->status |= IRQ_PENDING;
306 goto out_unlock; 305 goto out_unlock;
306 }
307 307
308 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
308 desc->status |= IRQ_INPROGRESS; 309 desc->status |= IRQ_INPROGRESS;
309 spin_unlock(&desc->lock); 310 spin_unlock(&desc->lock);
310 311
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
396 397
397 /* 398 /*
398 * If its disabled or no action available 399 * If its disabled or no action available
399 * keep it masked and get out of here 400 * then mask it and get out of here:
400 */ 401 */
401 action = desc->action; 402 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 403 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
403 desc->status |= IRQ_PENDING; 404 desc->status |= IRQ_PENDING;
405 if (desc->chip->mask)
406 desc->chip->mask(irq);
404 goto out; 407 goto out;
405 } 408 }
406 409
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
562 565
563 /* Uninstall? */ 566 /* Uninstall? */
564 if (handle == handle_bad_irq) { 567 if (handle == handle_bad_irq) {
565 if (desc->chip != &no_irq_chip) { 568 if (desc->chip != &no_irq_chip)
566 desc->chip->mask(irq); 569 mask_ack_irq(desc, irq);
567 desc->chip->ack(irq);
568 }
569 desc->status |= IRQ_DISABLED; 570 desc->status |= IRQ_DISABLED;
570 desc->depth = 1; 571 desc->depth = 1;
571 } 572 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index acc5d9fe462b..5597c157442a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
38} 38}
39EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
40 40
41/**
42 * irq_can_set_affinity - Check if the affinity of a given irq can be set
43 * @irq: Interrupt to check
44 *
45 */
46int irq_can_set_affinity(unsigned int irq)
47{
48 struct irq_desc *desc = irq_desc + irq;
49
50 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
51 !desc->chip->set_affinity)
52 return 0;
53
54 return 1;
55}
56
57/**
58 * irq_set_affinity - Set the irq affinity of a given irq
59 * @irq: Interrupt to set affinity
60 * @cpumask: cpumask
61 *
62 */
63int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
64{
65 struct irq_desc *desc = irq_desc + irq;
66
67 if (!desc->chip->set_affinity)
68 return -EINVAL;
69
70 set_balance_irq_affinity(irq, cpumask);
71
72#ifdef CONFIG_GENERIC_PENDING_IRQ
73 set_pending_irq(irq, cpumask);
74#else
75 desc->affinity = cpumask;
76 desc->chip->set_affinity(irq, cpumask);
77#endif
78 return 0;
79}
80
41#endif 81#endif
42 82
43/** 83/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 if (new->flags & IRQF_PERCPU) 321 if (new->flags & IRQF_PERCPU)
282 desc->status |= IRQ_PER_CPU; 322 desc->status |= IRQ_PER_CPU;
283#endif 323#endif
324 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING;
327
284 if (!shared) { 328 if (!shared) {
285 irq_chip_set_defaults(desc->chip); 329 irq_chip_set_defaults(desc->chip);
286 330
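
irq_can_set_affinity() and irq_set_affinity() give kernel code a direct way to steer an interrupt, which the /proc affinity handler below is converted to use as well. A minimal sketch of a caller (the helper name and the choice of CPU 0 are placeholders):

    /* Sketch: pin an interrupt to the boot CPU if the chip allows it. */
    static void pin_irq_to_boot_cpu(unsigned int irq)
    {
            if (irq_can_set_affinity(irq))
                    irq_set_affinity(irq, cpumask_of_cpu(0));
    }
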
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6d3be06e8ce6..2db91eb54ad8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19#ifdef CONFIG_GENERIC_PENDING_IRQ
20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
21{
22 set_balance_irq_affinity(irq, mask_val);
23
24 /*
25 * Save these away for later use. Re-progam when the
26 * interrupt is pending
27 */
28 set_pending_irq(irq, mask_val);
29}
30#else
31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
32{
33 set_balance_irq_affinity(irq, mask_val);
34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
36}
37#endif
38
39static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 20 int count, int *eof, void *data)
41{ 21{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
55 cpumask_t new_value, tmp; 35 cpumask_t new_value, tmp;
56 36
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 37 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status)) 38 irq_balancing_disabled(irq))
59 return -EIO; 39 return -EIO;
60 40
61 err = cpumask_parse_user(buffer, count, new_value); 41 err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
73 code to set default SMP affinity. */ 53 code to set default SMP affinity. */
74 return select_smp_affinity(irq) ? -EINVAL : full_count; 54 return select_smp_affinity(irq) ? -EINVAL : full_count;
75 55
76 proc_set_irq_affinity(irq, new_value); 56 irq_set_affinity(irq, new_value);
77 57
78 return full_count; 58 return full_count;
79} 59}
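For the user-visible side of this /proc change: an interrupt's affinity mask is exposed as a hexadecimal cpumask in /proc/irq/<N>/smp_affinity, and a write to that file now goes through irq_set_affinity() and is rejected with EIO when the IRQ is per-CPU or has balancing disabled. A minimal userspace sketch, not part of the patch; the IRQ number 16 and the target mask are arbitrary examples, and root privileges are required:

/* Sketch: read and rewrite /proc/irq/<N>/smp_affinity (IRQ 16 is hypothetical). */
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *path = "/proc/irq/16/smp_affinity";    /* example IRQ */
    char buf[64];
    FILE *f;

    f = fopen(path, "r");
    if (!f) {
        perror("open for read");
        return 1;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("current mask: %s", buf);
    fclose(f);

    f = fopen(path, "w");
    if (!f) {
        perror("open for write");
        return 1;
    }
    /* Restrict the IRQ to CPU 0.  The kernel refuses with EIO when the
     * IRQ is per-CPU or has balancing disabled (see the check above). */
    if (fprintf(f, "1\n") < 0) {
        fprintf(stderr, "write failed: %s\n", strerror(errno));
        fclose(f);
        return 1;
    }
    if (fclose(f) != 0) {
        perror("close");
        return 1;
    }
    return 0;
}

Reading the same file afterwards should show the new mask, or the write fails with EIO as described above.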
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e75..307c6a632ef6 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(struct hrtimer *timer) 131enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct signal_struct *sig = 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer); 134 container_of(timer, struct signal_struct, real_timer);
135 135
136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
137 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART;
142 }
143 return HRTIMER_NORESTART; 138 return HRTIMER_NORESTART;
144} 139}
145 140
@@ -231,11 +226,14 @@ again:
231 spin_unlock_irq(&tsk->sighand->siglock); 226 spin_unlock_irq(&tsk->sighand->siglock);
232 goto again; 227 goto again;
233 } 228 }
234 tsk->signal->it_real_incr =
235 timeval_to_ktime(value->it_interval);
236 expires = timeval_to_ktime(value->it_value); 229 expires = timeval_to_ktime(value->it_value);
237 if (expires.tv64 != 0) 230 if (expires.tv64 != 0) {
238 hrtimer_start(timer, expires, HRTIMER_REL); 231 tsk->signal->it_real_incr =
232 timeval_to_ktime(value->it_interval);
233 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
234 } else
235 tsk->signal->it_real_incr.tv64 = 0;
236
239 spin_unlock_irq(&tsk->sighand->siglock); 237 spin_unlock_irq(&tsk->sighand->siglock);
240 break; 238 break;
241 case ITIMER_VIRTUAL: 239 case ITIMER_VIRTUAL:
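The itimer rework above stores it_real_incr only when an initial expiry is programmed and leaves periodic re-arming to the signal-delivery path (see the kernel/signal.c hunk below). The userspace contract is unchanged: a non-zero it_interval keeps SIGALRM firing periodically. A self-contained illustration of that contract, userspace only, not kernel code:

/* ITIMER_REAL from userspace: a 100 ms periodic timer delivering SIGALRM. */
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t ticks;

static void on_alarm(int sig)
{
    (void)sig;
    ticks++;
}

int main(void)
{
    struct sigaction sa;
    struct itimerval it = {
        .it_interval = { 0, 100000 },   /* 100 ms re-arm period */
        .it_value    = { 0, 100000 },   /* first expiry */
    };

    sa.sa_handler = on_alarm;
    sa.sa_flags = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGALRM, &sa, NULL);

    setitimer(ITIMER_REAL, &it, NULL);

    while (ticks < 5)
        pause();                        /* each SIGALRM interrupts pause() */

    printf("received %d SIGALRMs\n", (int)ticks);
    return 0;
}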
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5b..657f77697415 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
304 * should be able to see it. 304 * should be able to see it.
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 read_lock(&tasklist_lock); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else if (p->tgid == pid && p->signal) { 315 } else {
316 error = cpu_clock_sample_group(which_clock, 316 read_lock(&tasklist_lock);
317 p, &rtn); 317 if (p->tgid == pid && p->signal) {
318 error =
319 cpu_clock_sample_group(which_clock,
320 p, &rtn);
321 }
322 read_unlock(&tasklist_lock);
318 } 323 }
319 } 324 }
320 read_unlock(&tasklist_lock); 325 rcu_read_unlock();
321 } 326 }
322 327
323 if (error) 328 if (error)
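posix_cpu_clock_get() above now uses RCU for the task lookup and only takes tasklist_lock for the process-wide sample. The syscall being served is clock_gettime() on CPU-time clocks; a small sketch of the userspace side (clock_getcpuclockid() may need -lrt with older glibc):

/* Userspace side of the code above: sampling CPU-time clocks. */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void burn_cpu(void)
{
    volatile unsigned long i;

    for (i = 0; i < 50000000UL; i++)
        ;
}

int main(void)
{
    struct timespec ts;
    clockid_t cid;

    burn_cpu();

    /* Process-wide CPU clock of the caller. */
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) == 0)
        printf("process CPU time: %ld.%09ld s\n",
               (long)ts.tv_sec, ts.tv_nsec);

    /* The by-PID variant is what goes through find_task_by_pid() above. */
    if (clock_getcpuclockid(getpid(), &cid) == 0 &&
        clock_gettime(cid, &ts) == 0)
        printf("per-PID CPU time:  %ld.%09ld s\n",
               (long)ts.tv_sec, ts.tv_nsec);

    return 0;
}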
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1bf61617839..44318ca71978 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(struct hrtimer *data); 148static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
334 334
335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
336 */ 336 */
337static int posix_timer_fn(struct hrtimer *timer) 337static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
338{ 338{
339 struct k_itimer *timr; 339 struct k_itimer *timr;
340 unsigned long flags; 340 unsigned long flags;
341 int si_private = 0; 341 int si_private = 0;
342 int ret = HRTIMER_NORESTART; 342 enum hrtimer_restart ret = HRTIMER_NORESTART;
343 343
344 timr = container_of(timer, struct k_itimer, it.real.timer); 344 timr = container_of(timer, struct k_itimer, it.real.timer);
345 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
356 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
357 timr->it_overrun += 357 timr->it_overrun +=
358 hrtimer_forward(timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time, 359 hrtimer_cb_get_time(timer),
360 timr->it.real.interval); 360 timr->it.real.interval);
361 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
723 return 0; 723 return 0;
724 724
725 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 725 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
727 timr->it.real.timer.function = posix_timer_fn; 727 timr->it.real.timer.function = posix_timer_fn;
728 728
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
734 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 734 /* SIGEV_NONE timers are not queued ! See common_timer_get */
735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
736 /* Setup correct expiry time for relative timers */ 736 /* Setup correct expiry time for relative timers */
737 if (mode == HRTIMER_REL) 737 if (mode == HRTIMER_MODE_REL)
738 timer->expires = ktime_add(timer->expires, 738 timer->expires = ktime_add(timer->expires,
739 timer->base->get_time()); 739 timer->base->get_time());
740 return 0; 740 return 0;
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
950 struct timespec *tsave, struct timespec __user *rmtp) 950 struct timespec *tsave, struct timespec __user *rmtp)
951{ 951{
952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
953 HRTIMER_ABS : HRTIMER_REL, which_clock); 953 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
954 which_clock);
954} 955}
955 956
956asmlinkage long 957asmlinkage long
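The HRTIMER_ABS/HRTIMER_REL constants are renamed to HRTIMER_MODE_ABS/HRTIMER_MODE_REL here, and the mode is chosen from the TIMER_ABSTIME flag that userspace passes to timer_settime(). A minimal sketch of that userspace path, using CLOCK_MONOTONIC and a SIGEV_NONE timer purely as an example (link with -lrt on older systems):

/* Arming a POSIX timer with an absolute expiry (TIMER_ABSTIME).  The clock
 * and SIGEV_NONE notification are arbitrary choices for the example. */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    timer_t tid;
    struct sigevent sev = { .sigev_notify = SIGEV_NONE };
    struct itimerspec its = { .it_interval = { 0, 0 } };
    struct timespec now;

    if (timer_create(CLOCK_MONOTONIC, &sev, &tid)) {
        perror("timer_create");
        return 1;
    }

    /* Absolute deadline one second from now -> HRTIMER_MODE_ABS in-kernel. */
    clock_gettime(CLOCK_MONOTONIC, &now);
    its.it_value = now;
    its.it_value.tv_sec += 1;
    if (timer_settime(tid, TIMER_ABSTIME, &its, NULL)) {
        perror("timer_settime");
        return 1;
    }

    /* For SIGEV_NONE timers the remaining time simply counts down. */
    timer_gettime(tid, &its);
    printf("expires in %ld.%09ld s\n",
           (long)its.it_value.tv_sec, its.it_value.tv_nsec);

    timer_delete(tid);
    return 0;
}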
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd8..180978cb2f75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
625 /* Setup the timer, when timeout != NULL */ 625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout)) 626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires, 627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS); 628 HRTIMER_MODE_ABS);
629 629
630 for (;;) { 630 for (;;) {
631 /* Try to acquire the lock: */ 631 /* Try to acquire the lock: */
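rt_mutex_slowlock() arms its optional timeout with an absolute expiry, now spelled HRTIMER_MODE_ABS. The same absolute-deadline convention is visible from userspace in pthread_mutex_timedlock(), which for priority-inheritance mutexes backed by PI futexes can end up in this rtmutex code; the sketch below only illustrates the calling convention and makes no claim about the exact kernel path taken (build with -pthread):

/* Absolute-deadline locking from userspace (illustration only). */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    struct timespec deadline;
    int ret;

    /* pthread_mutex_timedlock() takes an absolute CLOCK_REALTIME deadline,
     * mirroring the absolute-expiry (HRTIMER_MODE_ABS) timeout above. */
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += 2;

    ret = pthread_mutex_timedlock(&lock, &deadline);
    if (ret == 0) {
        printf("lock acquired before the deadline\n");
        pthread_mutex_unlock(&lock);
    } else if (ret == ETIMEDOUT) {
        printf("timed out\n");
    }
    return 0;
}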
diff --git a/kernel/signal.c b/kernel/signal.c
index 8072e568bbe0..e2a7d4bf7d57 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
457{ 457{
458 int signr = __dequeue_signal(&tsk->pending, mask, info); 458 int signr = __dequeue_signal(&tsk->pending, mask, info);
459 if (!signr) 459 if (!signr) {
460 signr = __dequeue_signal(&tsk->signal->shared_pending, 460 signr = __dequeue_signal(&tsk->signal->shared_pending,
461 mask, info); 461 mask, info);
462 /*
463 * itimer signal ?
464 *
465 * itimers are process shared and we restart periodic
466 * itimers in the signal delivery path to prevent DoS
467 * attacks in the high resolution timer case. This is
468 * compliant with the old way of self restarting
469 * itimers, as the SIGALRM is a legacy signal and only
470 * queued once. Changing the restart behaviour to
471 * restart the timer in the signal dequeue path is
 472 * reducing the timer noise on heavily loaded !highres
473 * systems too.
474 */
475 if (unlikely(signr == SIGALRM)) {
476 struct hrtimer *tmr = &tsk->signal->real_timer;
477
478 if (!hrtimer_is_queued(tmr) &&
479 tsk->signal->it_real_incr.tv64 != 0) {
480 hrtimer_forward(tmr, tmr->base->get_time(),
481 tsk->signal->it_real_incr);
482 hrtimer_restart(tmr);
483 }
484 }
485 }
462 recalc_sigpending_tsk(tsk); 486 recalc_sigpending_tsk(tsk);
463 if (signr && unlikely(sig_kernel_stop(signr))) { 487 if (signr && unlikely(sig_kernel_stop(signr))) {
464 /* 488 /*
465 * Set a marker that we have dequeued a stop signal. Our 489 * Set a marker that we have dequeued a stop signal. Our
466 * caller might release the siglock and then the pending 490 * caller might release the siglock and then the pending
467 * stop signal it is about to process is no longer in the 491 * stop signal it is about to process is no longer in the
468 * pending bitmasks, but must still be cleared by a SIGCONT 492 * pending bitmasks, but must still be cleared by a SIGCONT
469 * (and overruled by a SIGKILL). So those cases clear this 493 * (and overruled by a SIGKILL). So those cases clear this
470 * shared flag after we've set it. Note that this flag may 494 * shared flag after we've set it. Note that this flag may
471 * remain set after the signal we return is ignored or 495 * remain set after the signal we return is ignored or
472 * handled. That doesn't matter because its only purpose 496 * handled. That doesn't matter because its only purpose
473 * is to alert stop-signal processing code when another 497 * is to alert stop-signal processing code when another
474 * processor has come along and cleared the flag. 498 * processor has come along and cleared the flag.
475 */ 499 */
476 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 500 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
477 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 501 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
478 } 502 }
479 if ( signr && 503 if ( signr &&
480 ((info->si_code & __SI_MASK) == __SI_TIMER) && 504 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
481 info->si_sys_private){ 505 info->si_sys_private){
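The hunk above moves re-arming of the periodic real itimer into dequeue_signal(), so the timer is restarted when SIGALRM is actually dequeued rather than from the timer callback. The behaviour is the same whether the signal is taken asynchronously or consumed synchronously; a small sketch of the synchronous case, where sigwait() is the point at which the dequeue happens from the caller's perspective:

/* The itimer keeps firing even when SIGALRM is consumed synchronously;
 * re-arming now happens when the signal is dequeued (see the hunk above). */
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

int main(void)
{
    sigset_t set;
    struct itimerval it = {
        .it_interval = { 0, 200000 },   /* 200 ms period */
        .it_value    = { 0, 200000 },
    };
    int i, sig;

    sigemptyset(&set);
    sigaddset(&set, SIGALRM);
    sigprocmask(SIG_BLOCK, &set, NULL); /* deliver via sigwait, not a handler */

    setitimer(ITIMER_REAL, &it, NULL);

    for (i = 0; i < 5; i++) {
        sigwait(&set, &sig);
        printf("tick %d (signal %d)\n", i + 1, sig);
    }
    return 0;
}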
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090e..8b75008e2bd8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/tick.h>
20 21
21#include <asm/irq.h> 22#include <asm/irq.h>
22/* 23/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
273 274
274#endif 275#endif
275 276
277/*
278 * Enter an interrupt context.
279 */
280void irq_enter(void)
281{
282 __irq_enter();
283#ifdef CONFIG_NO_HZ
284 if (idle_cpu(smp_processor_id()))
285 tick_nohz_update_jiffies();
286#endif
287}
288
276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 289#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
277# define invoke_softirq() __do_softirq() 290# define invoke_softirq() __do_softirq()
278#else 291#else
@@ -289,6 +302,12 @@ void irq_exit(void)
289 sub_preempt_count(IRQ_EXIT_OFFSET); 302 sub_preempt_count(IRQ_EXIT_OFFSET);
290 if (!in_interrupt() && local_softirq_pending()) 303 if (!in_interrupt() && local_softirq_pending())
291 invoke_softirq(); 304 invoke_softirq();
305
306#ifdef CONFIG_NO_HZ
307 /* Make sure that timer wheel updates are propagated */
308 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
309 tick_nohz_stop_sched_tick();
310#endif
292 preempt_enable_no_resched(); 311 preempt_enable_no_resched();
293} 312}
294 313
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c19..c6c80ea5d0ea 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
470 return tv; 470 return tv;
471} 471}
472 472
473/*
474 * Convert jiffies to milliseconds and back.
475 *
476 * Avoid unnecessary multiplications/divisions in the
477 * two most common HZ cases:
478 */
479unsigned int jiffies_to_msecs(const unsigned long j)
480{
481#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
482 return (MSEC_PER_SEC / HZ) * j;
483#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
484 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
485#else
486 return (j * MSEC_PER_SEC) / HZ;
487#endif
488}
489EXPORT_SYMBOL(jiffies_to_msecs);
490
491unsigned int jiffies_to_usecs(const unsigned long j)
492{
493#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
494 return (USEC_PER_SEC / HZ) * j;
495#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
496 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
497#else
498 return (j * USEC_PER_SEC) / HZ;
499#endif
500}
501EXPORT_SYMBOL(jiffies_to_usecs);
502
503/*
504 * When we convert to jiffies then we interpret incoming values
505 * the following way:
506 *
507 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
508 *
509 * - 'too large' values [that would result in larger than
510 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
511 *
512 * - all other values are converted to jiffies by either multiplying
513 * the input value by a factor or dividing it with a factor
514 *
515 * We must also be careful about 32-bit overflows.
516 */
517unsigned long msecs_to_jiffies(const unsigned int m)
518{
519 /*
520 * Negative value, means infinite timeout:
521 */
522 if ((int)m < 0)
523 return MAX_JIFFY_OFFSET;
524
525#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
526 /*
527 * HZ is equal to or smaller than 1000, and 1000 is a nice
528 * round multiple of HZ, divide with the factor between them,
529 * but round upwards:
530 */
531 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
532#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
533 /*
534 * HZ is larger than 1000, and HZ is a nice round multiple of
535 * 1000 - simply multiply with the factor between them.
536 *
537 * But first make sure the multiplication result cannot
538 * overflow:
539 */
540 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
541 return MAX_JIFFY_OFFSET;
542
543 return m * (HZ / MSEC_PER_SEC);
544#else
545 /*
546 * Generic case - multiply, round and divide. But first
 547 * check that if we are doing a net multiplication,
 548 * we wouldn't overflow:
549 */
550 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
551 return MAX_JIFFY_OFFSET;
552
553 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
554#endif
555}
556EXPORT_SYMBOL(msecs_to_jiffies);
557
558unsigned long usecs_to_jiffies(const unsigned int u)
559{
560 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
561 return MAX_JIFFY_OFFSET;
562#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
563 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
564#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
565 return u * (HZ / USEC_PER_SEC);
566#else
567 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
568#endif
569}
570EXPORT_SYMBOL(usecs_to_jiffies);
571
572/*
573 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
574 * that a remainder subtract here would not do the right thing as the
 575 * resolution values don't fall on second boundaries. I.e. the line:
576 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
577 *
578 * Rather, we just shift the bits off the right.
579 *
580 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
581 * value to a scaled second value.
582 */
583unsigned long
584timespec_to_jiffies(const struct timespec *value)
585{
586 unsigned long sec = value->tv_sec;
587 long nsec = value->tv_nsec + TICK_NSEC - 1;
588
589 if (sec >= MAX_SEC_IN_JIFFIES){
590 sec = MAX_SEC_IN_JIFFIES;
591 nsec = 0;
592 }
593 return (((u64)sec * SEC_CONVERSION) +
594 (((u64)nsec * NSEC_CONVERSION) >>
595 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
596
597}
598EXPORT_SYMBOL(timespec_to_jiffies);
599
600void
601jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
602{
603 /*
604 * Convert jiffies to nanoseconds and separate with
605 * one divide.
606 */
607 u64 nsec = (u64)jiffies * TICK_NSEC;
608 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
609}
610EXPORT_SYMBOL(jiffies_to_timespec);
611
612/* Same for "timeval"
613 *
614 * Well, almost. The problem here is that the real system resolution is
 615 * in nanoseconds and the value being converted is in microseconds.
 616 * Also for some machines (those that use HZ = 1024, in particular),
617 * there is a LARGE error in the tick size in microseconds.
618
619 * The solution we use is to do the rounding AFTER we convert the
620 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
 621 * Instruction-wise, this should cost only an additional add-with-carry
 622 * instruction compared to the way it was done above.
623 */
624unsigned long
625timeval_to_jiffies(const struct timeval *value)
626{
627 unsigned long sec = value->tv_sec;
628 long usec = value->tv_usec;
629
630 if (sec >= MAX_SEC_IN_JIFFIES){
631 sec = MAX_SEC_IN_JIFFIES;
632 usec = 0;
633 }
634 return (((u64)sec * SEC_CONVERSION) +
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637}
638
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{
641 /*
642 * Convert jiffies to nanoseconds and separate with
643 * one divide.
644 */
645 u64 nsec = (u64)jiffies * TICK_NSEC;
646 long tv_usec;
647
648 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
649 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec;
651}
652
653/*
654 * Convert jiffies/jiffies_64 to clock_t and back.
655 */
656clock_t jiffies_to_clock_t(long x)
657{
658#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
659 return x / (HZ / USER_HZ);
660#else
661 u64 tmp = (u64)x * TICK_NSEC;
662 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
663 return (long)tmp;
664#endif
665}
666EXPORT_SYMBOL(jiffies_to_clock_t);
667
668unsigned long clock_t_to_jiffies(unsigned long x)
669{
670#if (HZ % USER_HZ)==0
671 if (x >= ~0UL / (HZ / USER_HZ))
672 return ~0UL;
673 return x * (HZ / USER_HZ);
674#else
675 u64 jif;
676
677 /* Don't worry about loss of precision here .. */
678 if (x >= ~0UL / HZ * USER_HZ)
679 return ~0UL;
680
681 /* .. but do try to contain it here */
682 jif = x * (u64) HZ;
683 do_div(jif, USER_HZ);
684 return jif;
685#endif
686}
687EXPORT_SYMBOL(clock_t_to_jiffies);
688
689u64 jiffies_64_to_clock_t(u64 x)
690{
691#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
692 do_div(x, HZ / USER_HZ);
693#else
694 /*
695 * There are better ways that don't overflow early,
696 * but even this doesn't overflow in hundreds of years
697 * in 64 bits, so..
698 */
699 x *= TICK_NSEC;
700 do_div(x, (NSEC_PER_SEC / USER_HZ));
701#endif
702 return x;
703}
704
705EXPORT_SYMBOL(jiffies_64_to_clock_t);
706
707u64 nsec_to_clock_t(u64 x)
708{
709#if (NSEC_PER_SEC % USER_HZ) == 0
710 do_div(x, (NSEC_PER_SEC / USER_HZ));
711#elif (USER_HZ % 512) == 0
712 x *= USER_HZ/512;
713 do_div(x, (NSEC_PER_SEC / 512));
714#else
715 /*
716 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
717 * overflow after 64.99 years.
718 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
719 */
720 x *= 9;
721 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
722 USER_HZ));
723#endif
724 return x;
725}
726
473#if (BITS_PER_LONG < 64) 727#if (BITS_PER_LONG < 64)
474u64 get_jiffies_64(void) 728u64 get_jiffies_64(void)
475{ 729{
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
22 This option enables high resolution timer support. If your
23 hardware is not capable then this option only increases
24 the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
1obj-y += ntp.o clocksource.o jiffies.o 1obj-y += ntp.o clocksource.o jiffies.o timer_list.o
2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
8obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
33 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
101
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
 136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
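clockevent_delta2ns() and clockevents_program_event() above convert between device ticks and nanoseconds with a mult/shift pair: ns = (ticks << shift) / mult and ticks = (ns * mult) >> shift. A standalone model of that scaling, without the bound checks, for a hypothetical 1 MHz event device whose mult/shift values are invented for the example:

/* Standalone model of the mult/shift scaling above (bound checks omitted). */
#include <stdint.h>
#include <stdio.h>

struct fake_evtdev {
    uint32_t mult;
    uint32_t shift;
};

/* ns = (ticks << shift) / mult, as in clockevent_delta2ns() */
static uint64_t delta2ns(uint64_t latch, const struct fake_evtdev *evt)
{
    return (latch << evt->shift) / evt->mult;
}

/* ticks = (ns * mult) >> shift, as in clockevents_program_event() */
static uint64_t ns2delta(uint64_t ns, const struct fake_evtdev *evt)
{
    return (ns * evt->mult) >> evt->shift;
}

int main(void)
{
    /* Hypothetical 1 MHz device, shift = 20:
     * mult = (freq << shift) / NSEC_PER_SEC, so ns * mult >> shift ~= ticks. */
    struct fake_evtdev dev = {
        .mult  = (uint32_t)((1000000ULL << 20) / 1000000000ULL),
        .shift = 20,
    };

    printf("mult=%u shift=%u\n", dev.mult, dev.shift);
    printf("1000 device ticks -> %llu ns\n",
           (unsigned long long)delta2ns(1000, &dev));
    printf("1 ms              -> %llu device ticks\n",
           (unsigned long long)ns2delta(1000000, &dev));
    return 0;
}

The truncated mult introduces a small, bounded rounding error, which is why the real code clamps the converted values against min_delta_ns and max_delta_ns.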
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e09..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
29#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h>
32 33
33/* XXX - Would like a better way for initializing curr_clocksource */ 34/* XXX - Would like a better way for initializing curr_clocksource */
34extern struct clocksource clocksource_jiffies; 35extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
48 */ 49 */
49static struct clocksource *curr_clocksource = &clocksource_jiffies; 50static struct clocksource *curr_clocksource = &clocksource_jiffies;
50static struct clocksource *next_clocksource; 51static struct clocksource *next_clocksource;
52static struct clocksource *clocksource_override;
51static LIST_HEAD(clocksource_list); 53static LIST_HEAD(clocksource_list);
52static DEFINE_SPINLOCK(clocksource_lock); 54static DEFINE_SPINLOCK(clocksource_lock);
53static char override_name[32]; 55static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
62 finished_booting = 1; 64 finished_booting = 1;
63 return 0; 65 return 0;
64} 66}
65
66late_initcall(clocksource_done_booting); 67late_initcall(clocksource_done_booting);
67 68
69#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
70static LIST_HEAD(watchdog_list);
71static struct clocksource *watchdog;
72static struct timer_list watchdog_timer;
73static DEFINE_SPINLOCK(watchdog_lock);
74static cycle_t watchdog_last;
75/*
 76 * Interval: 0.5sec Threshold: 0.0625s
77 */
78#define WATCHDOG_INTERVAL (HZ >> 1)
79#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
80
81static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
82{
83 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
84 return;
85
86 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
87 cs->name, delta);
88 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
89 clocksource_change_rating(cs, 0);
90 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
91 list_del(&cs->wd_list);
92}
93
94static void clocksource_watchdog(unsigned long data)
95{
96 struct clocksource *cs, *tmp;
97 cycle_t csnow, wdnow;
98 int64_t wd_nsec, cs_nsec;
99
100 spin_lock(&watchdog_lock);
101
102 wdnow = watchdog->read();
103 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
104 watchdog_last = wdnow;
105
106 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
107 csnow = cs->read();
108 /* Initialized ? */
109 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
110 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
111 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
112 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
113 /*
114 * We just marked the clocksource as
115 * highres-capable, notify the rest of the
116 * system as well so that we transition
117 * into high-res mode:
118 */
119 tick_clock_notify();
120 }
121 cs->flags |= CLOCK_SOURCE_WATCHDOG;
122 cs->wd_last = csnow;
123 } else {
124 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
125 cs->wd_last = csnow;
126 /* Check the delta. Might remove from the list ! */
127 clocksource_ratewd(cs, cs_nsec - wd_nsec);
128 }
129 }
130
131 if (!list_empty(&watchdog_list)) {
132 __mod_timer(&watchdog_timer,
133 watchdog_timer.expires + WATCHDOG_INTERVAL);
134 }
135 spin_unlock(&watchdog_lock);
136}
137static void clocksource_check_watchdog(struct clocksource *cs)
138{
139 struct clocksource *cse;
140 unsigned long flags;
141
142 spin_lock_irqsave(&watchdog_lock, flags);
143 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
144 int started = !list_empty(&watchdog_list);
145
146 list_add(&cs->wd_list, &watchdog_list);
147 if (!started && watchdog) {
148 watchdog_last = watchdog->read();
149 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
150 add_timer(&watchdog_timer);
151 }
152 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
153 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
154
155 if (!watchdog || cs->rating > watchdog->rating) {
156 if (watchdog)
157 del_timer(&watchdog_timer);
158 watchdog = cs;
159 init_timer(&watchdog_timer);
160 watchdog_timer.function = clocksource_watchdog;
161
162 /* Reset watchdog cycles */
163 list_for_each_entry(cse, &watchdog_list, wd_list)
164 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
165 /* Start if list is not empty */
166 if (!list_empty(&watchdog_list)) {
167 watchdog_last = watchdog->read();
168 watchdog_timer.expires =
169 jiffies + WATCHDOG_INTERVAL;
170 add_timer(&watchdog_timer);
171 }
172 }
173 }
174 spin_unlock_irqrestore(&watchdog_lock, flags);
175}
176#else
177static void clocksource_check_watchdog(struct clocksource *cs)
178{
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
180 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
181}
182#endif
183
68/** 184/**
69 * clocksource_get_next - Returns the selected clocksource 185 * clocksource_get_next - Returns the selected clocksource
70 * 186 *
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
84} 200}
85 201
86/** 202/**
87 * select_clocksource - Finds the best registered clocksource. 203 * select_clocksource - Selects the best registered clocksource.
88 * 204 *
89 * Private function. Must hold clocksource_lock when called. 205 * Private function. Must hold clocksource_lock when called.
90 * 206 *
91 * Looks through the list of registered clocksources, returning 207 * Select the clocksource with the best rating, or the clocksource,
92 * the one with the highest rating value. If there is a clocksource 208 * which is selected by userspace override.
93 * name that matches the override string, it returns that clocksource.
94 */ 209 */
95static struct clocksource *select_clocksource(void) 210static struct clocksource *select_clocksource(void)
96{ 211{
97 struct clocksource *best = NULL; 212 struct clocksource *next;
98 struct list_head *tmp;
99 213
100 list_for_each(tmp, &clocksource_list) { 214 if (list_empty(&clocksource_list))
101 struct clocksource *src; 215 return NULL;
102 216
103 src = list_entry(tmp, struct clocksource, list); 217 if (clocksource_override)
104 if (!best) 218 next = clocksource_override;
105 best = src; 219 else
106 220 next = list_entry(clocksource_list.next, struct clocksource,
107 /* check for override: */ 221 list);
108 if (strlen(src->name) == strlen(override_name) && 222
109 !strcmp(src->name, override_name)) { 223 if (next == curr_clocksource)
110 best = src; 224 return NULL;
111 break;
112 }
113 /* pick the highest rating: */
114 if (src->rating > best->rating)
115 best = src;
116 }
117 225
118 return best; 226 return next;
119} 227}
120 228
121/** 229/*
122 * is_registered_source - Checks if clocksource is registered 230 * Enqueue the clocksource sorted by rating
123 * @c: pointer to a clocksource
124 *
125 * Private helper function. Must hold clocksource_lock when called.
126 *
127 * Returns one if the clocksource is already registered, zero otherwise.
128 */ 231 */
129static int is_registered_source(struct clocksource *c) 232static int clocksource_enqueue(struct clocksource *c)
130{ 233{
131 int len = strlen(c->name); 234 struct list_head *tmp, *entry = &clocksource_list;
132 struct list_head *tmp;
133 235
134 list_for_each(tmp, &clocksource_list) { 236 list_for_each(tmp, &clocksource_list) {
135 struct clocksource *src; 237 struct clocksource *cs;
136 238
137 src = list_entry(tmp, struct clocksource, list); 239 cs = list_entry(tmp, struct clocksource, list);
138 if (strlen(src->name) == len && !strcmp(src->name, c->name)) 240 if (cs == c)
139 return 1; 241 return -EBUSY;
242 /* Keep track of the place, where to insert */
243 if (cs->rating >= c->rating)
244 entry = tmp;
140 } 245 }
246 list_add(&c->list, entry);
247
248 if (strlen(c->name) == strlen(override_name) &&
249 !strcmp(c->name, override_name))
250 clocksource_override = c;
141 251
142 return 0; 252 return 0;
143} 253}
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
150 */ 260 */
151int clocksource_register(struct clocksource *c) 261int clocksource_register(struct clocksource *c)
152{ 262{
153 int ret = 0;
154 unsigned long flags; 263 unsigned long flags;
264 int ret;
155 265
156 spin_lock_irqsave(&clocksource_lock, flags); 266 spin_lock_irqsave(&clocksource_lock, flags);
157 /* check if clocksource is already registered */ 267 ret = clocksource_enqueue(c);
158 if (is_registered_source(c)) { 268 if (!ret)
159 printk("register_clocksource: Cannot register %s. "
160 "Already registered!", c->name);
161 ret = -EBUSY;
162 } else {
163 /* register it */
164 list_add(&c->list, &clocksource_list);
165 /* scan the registered clocksources, and pick the best one */
166 next_clocksource = select_clocksource(); 269 next_clocksource = select_clocksource();
167 }
168 spin_unlock_irqrestore(&clocksource_lock, flags); 270 spin_unlock_irqrestore(&clocksource_lock, flags);
271 if (!ret)
272 clocksource_check_watchdog(c);
169 return ret; 273 return ret;
170} 274}
171EXPORT_SYMBOL(clocksource_register); 275EXPORT_SYMBOL(clocksource_register);
172 276
173/** 277/**
174 * clocksource_reselect - Rescan list for next clocksource 278 * clocksource_change_rating - Change the rating of a registered clocksource
175 * 279 *
176 * A quick helper function to be used if a clocksource changes its
177 * rating. Forces the clocksource list to be re-scanned for the best
178 * clocksource.
179 */ 280 */
180void clocksource_reselect(void) 281void clocksource_change_rating(struct clocksource *cs, int rating)
181{ 282{
182 unsigned long flags; 283 unsigned long flags;
183 284
184 spin_lock_irqsave(&clocksource_lock, flags); 285 spin_lock_irqsave(&clocksource_lock, flags);
286 list_del(&cs->list);
287 cs->rating = rating;
288 clocksource_enqueue(cs);
185 next_clocksource = select_clocksource(); 289 next_clocksource = select_clocksource();
186 spin_unlock_irqrestore(&clocksource_lock, flags); 290 spin_unlock_irqrestore(&clocksource_lock, flags);
187} 291}
188EXPORT_SYMBOL(clocksource_reselect);
189 292
190#ifdef CONFIG_SYSFS 293#ifdef CONFIG_SYSFS
191/** 294/**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
221static ssize_t sysfs_override_clocksource(struct sys_device *dev, 324static ssize_t sysfs_override_clocksource(struct sys_device *dev,
222 const char *buf, size_t count) 325 const char *buf, size_t count)
223{ 326{
327 struct clocksource *ovr = NULL;
328 struct list_head *tmp;
224 size_t ret = count; 329 size_t ret = count;
330 int len;
331
225 /* strings from sysfs write are not 0 terminated! */ 332 /* strings from sysfs write are not 0 terminated! */
226 if (count >= sizeof(override_name)) 333 if (count >= sizeof(override_name))
227 return -EINVAL; 334 return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
229 /* strip of \n: */ 336 /* strip of \n: */
230 if (buf[count-1] == '\n') 337 if (buf[count-1] == '\n')
231 count--; 338 count--;
232 if (count < 1)
233 return -EINVAL;
234 339
235 spin_lock_irq(&clocksource_lock); 340 spin_lock_irq(&clocksource_lock);
236 341
237 /* copy the name given: */ 342 if (count > 0)
238 memcpy(override_name, buf, count); 343 memcpy(override_name, buf, count);
239 override_name[count] = 0; 344 override_name[count] = 0;
240 345
241 /* try to select it: */ 346 len = strlen(override_name);
242 next_clocksource = select_clocksource(); 347 if (len) {
348 ovr = clocksource_override;
349 /* try to select it: */
350 list_for_each(tmp, &clocksource_list) {
351 struct clocksource *cs;
352
353 cs = list_entry(tmp, struct clocksource, list);
354 if (strlen(cs->name) == len &&
355 !strcmp(cs->name, override_name))
356 ovr = cs;
357 }
358 }
359
360 /* Reselect, when the override name has changed */
361 if (ovr != clocksource_override) {
362 clocksource_override = ovr;
363 next_clocksource = select_clocksource();
364 }
243 365
244 spin_unlock_irq(&clocksource_lock); 366 spin_unlock_irq(&clocksource_lock);
245 367
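clocksource_enqueue() above keeps the clocksource list sorted by rating so that select_clocksource() can simply take the head unless a userspace override is set. A standalone model of the sorted insert, with invented names and ratings and a plain singly-linked list standing in for struct list_head:

/* Standalone model of clocksource_enqueue(): keep the list sorted by rating,
 * highest first, so selection just takes the head. */
#include <stdio.h>

struct cs {
    const char *name;
    int rating;
    struct cs *next;
};

static struct cs *head;

static int enqueue(struct cs *c)
{
    struct cs **pp = &head;

    for (; *pp; pp = &(*pp)->next) {
        if (*pp == c)
            return -1;              /* already registered (-EBUSY) */
        if ((*pp)->rating < c->rating)
            break;                  /* insert before lower-rated entries */
    }
    c->next = *pp;
    *pp = c;
    return 0;
}

int main(void)
{
    struct cs jiffies_cs = { "jiffies", 1,   NULL };
    struct cs pit        = { "pit",     110, NULL };
    struct cs tsc        = { "tsc",     300, NULL };
    struct cs *c;

    enqueue(&jiffies_cs);
    enqueue(&pit);
    enqueue(&tsc);

    for (c = head; c; c = c->next)
        printf("%-8s rating %d\n", c->name, c->rating);
    printf("selected: %s\n", head->name);   /* best rating wins */
    return 0;
}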
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66}; 65};
67 66
68static int __init init_jiffies_clocksource(void) 67static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
24 24
25#define MAX_TICKADJ 500 /* microsecs */ 25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ) 27 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
28 28
29/* 29/*
30 * phase-lock loop variables 30 * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
46 46
47static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
48{ 48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 51 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 do_div(tick_length_base, HZ); 54 tick_length_base = second_length;
54 55
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; 56 do_div(second_length, HZ);
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT;
58
59 do_div(tick_length_base, NTP_INTERVAL_FREQ);
56} 60}
57 61
58/** 62/**
@@ -162,7 +166,7 @@ void second_overflow(void)
162 tick_length -= MAX_TICKADJ_SCALED; 166 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 167 } else {
164 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 168 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
165 HZ) << TICK_LENGTH_SHIFT; 169 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0; 170 time_adjust = 0;
167 } 171 }
168 } 172 }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
239 result = -EINVAL; 243 result = -EINVAL;
240 goto leave; 244 goto leave;
241 } 245 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); 246 time_freq = ((s64)txc->freq * NSEC_PER_USEC)
247 >> (SHIFT_USEC - SHIFT_NSEC);
243 } 248 }
244 249
245 if (txc->modes & ADJ_MAXERROR) { 250 if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
309 freq_adj += time_freq; 314 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE; 317 time_offset = (time_offset / NTP_INTERVAL_FREQ)
318 << SHIFT_UPDATE;
313 } /* STA_PLL */ 319 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */ 320 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK) 321 if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust; 331 txc->offset = save_adjust;
326 else 332 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; 333 txc->offset = shift_right(time_offset, SHIFT_UPDATE)
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); 334 * NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC)
336 << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror; 337 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror; 338 txc->esterror = time_esterror;
331 txc->status = time_status; 339 txc->status = time_status;
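ntp_update_frequency() above computes the length of a full second in TICK_LENGTH_SHIFT fixed point and then derives both tick_nsec (per HZ tick) and tick_length_base (per NTP interval). A standalone sketch of that arithmetic; the 32-bit fractional shift and the HZ / USER_HZ / NTP_INTERVAL_FREQ values are assumptions made only for the illustration:

/* Fixed-point sketch of ntp_update_frequency() above.  The 32-bit shift and
 * the HZ / USER_HZ / NTP_INTERVAL_FREQ values are assumptions only. */
#include <stdint.h>
#include <stdio.h>

#define TICK_LENGTH_SHIFT 32
#define NSEC_PER_USEC     1000ULL
#define USER_HZ           100
#define HZ                250
#define NTP_INTERVAL_FREQ HZ

int main(void)
{
    uint64_t tick_usec = 1000000 / USER_HZ;   /* 10000 us per USER_HZ tick */
    int64_t time_freq = 0;                    /* no NTP frequency correction */
    int64_t clock_tick_adjust = 0;            /* CLOCK_TICK_ADJUST stand-in */

    /* Length of a full second, in nanoseconds << TICK_LENGTH_SHIFT: */
    uint64_t second_length =
        (tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
    second_length += (uint64_t)clock_tick_adjust << TICK_LENGTH_SHIFT;
    second_length += (uint64_t)time_freq;     /* the real code scales this too */

    uint64_t tick_nsec = (second_length / HZ) >> TICK_LENGTH_SHIFT;
    uint64_t tick_length_base = second_length / NTP_INTERVAL_FREQ;

    printf("tick_nsec        = %llu ns\n", (unsigned long long)tick_nsec);
    printf("tick_length_base = %llu (ns << %d per NTP interval)\n",
           (unsigned long long)tick_length_base, TICK_LENGTH_SHIFT);
    return 0;
}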
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
 81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
130 * one of the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
140
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
429 * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
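
The oneshot enter/exit handling above is driven from the CPU idle code through the clockevents notification chain (see tick_notify() in tick-common.c further down). A minimal sketch of how deep-idle code might bracket a C-state that stops the local tick device follows; clockevents_notify() and the idle entry itself are assumptions for illustration, not part of this diff.

/*
 * Illustrative sketch only. Assumes the clockevents_notify() helper from
 * the clockevents core; the deep sleep entry is a placeholder.
 */
#include <linux/clockchips.h>
#include <linux/smp.h>

static void example_enter_deep_idle(void)
{
	int cpu = smp_processor_id();

	/* The local tick device stops in this state: let the broadcast device cover us */
	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

	/* ... architecture/platform specific deep sleep entry would go here ... */

	/* Back from idle: take over tick duties locally again */
	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
}
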
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..4500e347f1bb
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80
81 tick_periodic(cpu);
82
83 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
84 return;
85 /*
86 * Setup the next period for devices, which do not have
87 * periodic mode:
88 */
89 for (;;) {
90 ktime_t next = ktime_add(dev->next_event, tick_period);
91
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 }
96}
97
98/*
99 * Setup the device for a periodic tick
100 */
101void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
102{
103 tick_set_periodic_handler(dev, broadcast);
104
105 /* Broadcast setup ? */
106 if (!tick_device_is_functional(dev))
107 return;
108
109 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
111 } else {
112 unsigned long seq;
113 ktime_t next;
114
115 do {
116 seq = read_seqbegin(&xtime_lock);
117 next = tick_next_period;
118 } while (read_seqretry(&xtime_lock, seq));
119
120 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
121
122 for (;;) {
123 if (!clockevents_program_event(dev, next, ktime_get()))
124 return;
125 next = ktime_add(next, tick_period);
126 }
127 }
128}
129
130/*
131 * Setup the tick device
132 */
133static void tick_setup_device(struct tick_device *td,
134 struct clock_event_device *newdev, int cpu,
135 cpumask_t cpumask)
136{
137 ktime_t next_event;
138 void (*handler)(struct clock_event_device *) = NULL;
139
140 /*
141 * First device setup ?
142 */
143 if (!td->evtdev) {
144 /*
145 * If no cpu took the do_timer update, assign it to
146 * this cpu:
147 */
148 if (tick_do_timer_cpu == -1) {
149 tick_do_timer_cpu = cpu;
150 tick_next_period = ktime_get();
151 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
152 }
153
154 /*
155 * Startup in periodic mode first.
156 */
157 td->mode = TICKDEV_MODE_PERIODIC;
158 } else {
159 handler = td->evtdev->event_handler;
160 next_event = td->evtdev->next_event;
161 }
162
163 td->evtdev = newdev;
164
165 /*
166 * When the device is not per cpu, pin the interrupt to the
167 * current cpu:
168 */
169 if (!cpus_equal(newdev->cpumask, cpumask))
170 irq_set_affinity(newdev->irq, cpumask);
171
172 /*
173 * When global broadcasting is active, check if the current
174 * device is registered as a placeholder for broadcast mode.
175 * This allows us to handle this x86 misfeature in a generic
176 * way.
177 */
178 if (tick_device_uses_broadcast(newdev, cpu))
179 return;
180
181 if (td->mode == TICKDEV_MODE_PERIODIC)
182 tick_setup_periodic(newdev, 0);
183 else
184 tick_setup_oneshot(newdev, handler, next_event);
185}
186
187/*
188 * Check whether the newly registered device should be used.
189 */
190static int tick_check_new_device(struct clock_event_device *newdev)
191{
192 struct clock_event_device *curdev;
193 struct tick_device *td;
194 int cpu, ret = NOTIFY_OK;
195 unsigned long flags;
196 cpumask_t cpumask;
197
198 spin_lock_irqsave(&tick_device_lock, flags);
199
200 cpu = smp_processor_id();
201 if (!cpu_isset(cpu, newdev->cpumask))
202 goto out;
203
204 td = &per_cpu(tick_cpu_device, cpu);
205 curdev = td->evtdev;
206 cpumask = cpumask_of_cpu(cpu);
207
208 /* cpu local device ? */
209 if (!cpus_equal(newdev->cpumask, cpumask)) {
210
211 /*
212 * If the cpu affinity of the device interrupt can not
213 * be set, ignore it.
214 */
215 if (!irq_can_set_affinity(newdev->irq))
216 goto out_bc;
217
218 /*
219 * If we have a cpu local device already, do not replace it
220 * by a non cpu local device
221 */
222 if (curdev && cpus_equal(curdev->cpumask, cpumask))
223 goto out_bc;
224 }
225
226 /*
227 * If we have an active device, then check the rating and the oneshot
228 * feature.
229 */
230 if (curdev) {
231 /*
232 * Prefer one shot capable devices !
233 */
234 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
235 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
236 goto out_bc;
237 /*
238 * Check the rating
239 */
240 if (curdev->rating >= newdev->rating)
241 goto out_bc;
242 }
243
244 /*
245 * Replace the existing device, if any, by the new
246 * device. If the current device is the broadcast device, do
247 * not give it back to the clockevents layer !
248 */
249 if (tick_is_broadcast_device(curdev)) {
250 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
251 curdev = NULL;
252 }
253 clockevents_exchange_device(curdev, newdev);
254 tick_setup_device(td, newdev, cpu, cpumask);
255 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
256 tick_oneshot_notify();
257
258 spin_unlock_irqrestore(&tick_device_lock, flags);
259 return NOTIFY_STOP;
260
261out_bc:
262 /*
263 * Can the new device be used as a broadcast device ?
264 */
265 if (tick_check_broadcast_device(newdev))
266 ret = NOTIFY_STOP;
267out:
268 spin_unlock_irqrestore(&tick_device_lock, flags);
269
270 return ret;
271}
272
273/*
274 * Shutdown an event device on a given cpu:
275 *
276 * This is called on a live CPU when another CPU is dead, so we cannot
277 * access the hardware device itself.
278 * We just set the mode and remove it from the lists.
279 */
280static void tick_shutdown(unsigned int *cpup)
281{
282 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
283 struct clock_event_device *dev = td->evtdev;
284 unsigned long flags;
285
286 spin_lock_irqsave(&tick_device_lock, flags);
287 td->mode = TICKDEV_MODE_PERIODIC;
288 if (dev) {
289 /*
290		 * Prevent the clock events layer from trying to call
291		 * the set_mode function!
292 */
293 dev->mode = CLOCK_EVT_MODE_UNUSED;
294 clockevents_exchange_device(dev, NULL);
295 td->evtdev = NULL;
296 }
297 spin_unlock_irqrestore(&tick_device_lock, flags);
298}
299
300/*
301 * Notification about clock event devices
302 */
303static int tick_notify(struct notifier_block *nb, unsigned long reason,
304 void *dev)
305{
306 switch (reason) {
307
308 case CLOCK_EVT_NOTIFY_ADD:
309 return tick_check_new_device(dev);
310
311 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
312 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
313 tick_broadcast_on_off(reason, dev);
314 break;
315
316 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
317 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
318 tick_broadcast_oneshot_control(reason);
319 break;
320
321 case CLOCK_EVT_NOTIFY_CPU_DEAD:
322 tick_shutdown_broadcast_oneshot(dev);
323 tick_shutdown_broadcast(dev);
324 tick_shutdown(dev);
325 break;
326
327 default:
328 break;
329 }
330
331 return NOTIFY_OK;
332}
333
334static struct notifier_block tick_notifier = {
335 .notifier_call = tick_notify,
336};
337
338/**
339 * tick_init - initialize the tick control
340 *
341 * Register the notifier with the clockevents framework
342 */
343void __init tick_init(void)
344{
345 clockevents_register_notifier(&tick_notifier);
346}
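
tick_check_new_device() above is reached through the CLOCK_EVT_NOTIFY_ADD notification when a timer driver registers its clock event device. A hedged sketch of such a registration follows; the device fields, values and the clockevents_register_device() call are illustrative assumptions, not taken from this diff.

/*
 * Illustrative per-cpu clock event device registration that would end up
 * in tick_check_new_device(). All names and numbers are made up.
 */
#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static struct clock_event_device example_evt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.shift		= 32,
	.rating		= 300,	/* a higher rating wins in tick_check_new_device() */
	.irq		= -1,	/* per-cpu interrupt, no affinity setting needed */
	/* .mult, .set_next_event and .set_mode are filled in by the real driver */
};

static void example_register_tick_device(void)
{
	example_evt.cpumask = cpumask_of_cpu(smp_processor_id());
	clockevents_register_device(&example_evt);
}
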
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variable and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check, if the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
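
The CLOCK_EVT_FEAT_DUMMY test above is what lets a non-functional per-cpu device (for example a local APIC timer that stops in deep C-states) stay registered as a mere broadcast placeholder. A small illustrative use of the helper, not taken from the diff:

/*
 * Illustrative only: skip programming a device that is just a broadcast
 * placeholder; the broadcast device covers this cpu instead.
 */
static void example_maybe_program(struct clock_event_device *dev, ktime_t expires)
{
	if (!tick_device_is_functional(dev))
		return;

	clockevents_program_event(dev, expires, ktime_get());
}
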
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
26 * tick_program_event
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
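
The force parameter of tick_program_event() retries min_delta_ns into the future when the requested expiry already lies in the past. A short usage sketch (the caller is hypothetical):

/*
 * Illustrative only: rearm the local tick device; if the wanted expiry
 * already passed, force the earliest possible event instead.
 */
static void example_rearm_tick(ktime_t next_tick)
{
	if (tick_program_event(next_tick, 0))
		tick_program_event(ktime_get(), 1);
}
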
42
43/**
44 * tick_program_event - program the CPU local tick device for the next event
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
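
tick_init_highres() is the hook the hrtimer core is expected to call, with interrupts disabled, when it switches a CPU to high resolution mode; after a successful switch the per-cpu sched timer from tick-sched.c emulates the periodic tick. A rough sketch of that hand-over, assuming a hrtimer-side switch helper that is not part of this file:

/*
 * Illustrative sketch of the expected call order; the surrounding
 * hrtimer-side function is an assumption.
 */
static void example_switch_cpu_to_highres(void)
{
	if (tick_init_highres())	/* installs hrtimer_interrupt as tick handler */
		return;			/* no oneshot capable tick device on this cpu */

	tick_setup_sched_timer();	/* emulate the tick with a per-cpu hrtimer */
}
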
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..95e41f7f850b
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,563 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include "tick-internal.h"
25
26/*
27 * Per cpu nohz control structure
28 */
29static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
30
31/*
32 * The time, when the last jiffy update happened. Protected by xtime_lock.
33 */
34static ktime_t last_jiffies_update;
35
36struct tick_sched *tick_get_tick_sched(int cpu)
37{
38 return &per_cpu(tick_cpu_sched, cpu);
39}
40
41/*
42 * Must be called with interrupts disabled !
43 */
44static void tick_do_update_jiffies64(ktime_t now)
45{
46 unsigned long ticks = 0;
47 ktime_t delta;
48
49	/* Reevaluate with xtime_lock held */
50 write_seqlock(&xtime_lock);
51
52 delta = ktime_sub(now, last_jiffies_update);
53 if (delta.tv64 >= tick_period.tv64) {
54
55 delta = ktime_sub(delta, tick_period);
56 last_jiffies_update = ktime_add(last_jiffies_update,
57 tick_period);
58
59 /* Slow path for long timeouts */
60 if (unlikely(delta.tv64 >= tick_period.tv64)) {
61 s64 incr = ktime_to_ns(tick_period);
62
63 ticks = ktime_divns(delta, incr);
64
65 last_jiffies_update = ktime_add_ns(last_jiffies_update,
66 incr * ticks);
67 }
68 do_timer(++ticks);
69 }
70 write_sequnlock(&xtime_lock);
71}
72
73/*
74 * Initialize and retrieve the time of the last jiffies update.
75 */
76static ktime_t tick_init_jiffy_update(void)
77{
78 ktime_t period;
79
80 write_seqlock(&xtime_lock);
81 /* Did we start the jiffies update yet ? */
82 if (last_jiffies_update.tv64 == 0)
83 last_jiffies_update = tick_next_period;
84 period = last_jiffies_update;
85 write_sequnlock(&xtime_lock);
86 return period;
87}
88
89/*
90 * NOHZ - aka dynamic tick functionality
91 */
92#ifdef CONFIG_NO_HZ
93/*
94 * NO HZ enabled ?
95 */
96static int tick_nohz_enabled __read_mostly = 1;
97
98/*
99 * Enable / Disable tickless mode
100 */
101static int __init setup_tick_nohz(char *str)
102{
103 if (!strcmp(str, "off"))
104 tick_nohz_enabled = 0;
105 else if (!strcmp(str, "on"))
106 tick_nohz_enabled = 1;
107 else
108 return 0;
109 return 1;
110}
111
112__setup("nohz=", setup_tick_nohz);
113
114/**
115 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
116 *
117 * Called from interrupt entry when the CPU was idle
118 *
119 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
120 * must be updated. Otherwise an interrupt handler could use a stale jiffy
121 * value. We do this unconditionally on any cpu, as we don't know whether the
122 * cpu which has the jiffies update duty assigned is in a long sleep.
123 */
124void tick_nohz_update_jiffies(void)
125{
126 int cpu = smp_processor_id();
127 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
128 unsigned long flags;
129 ktime_t now;
130
131 if (!ts->tick_stopped)
132 return;
133
134 cpu_clear(cpu, nohz_cpu_mask);
135 now = ktime_get();
136
137 local_irq_save(flags);
138 tick_do_update_jiffies64(now);
139 local_irq_restore(flags);
140}
141
142/**
143 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
144 *
145 * When the next event is more than a tick into the future, stop the idle tick.
146 * Called either from the idle loop or from irq_exit() when an idle period was
147 * just interrupted by an interrupt which did not cause a reschedule.
148 */
149void tick_nohz_stop_sched_tick(void)
150{
151 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
152 struct tick_sched *ts;
153 ktime_t last_update, expires, now, delta;
154 int cpu;
155
156 local_irq_save(flags);
157
158 cpu = smp_processor_id();
159 ts = &per_cpu(tick_cpu_sched, cpu);
160
161 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
162 goto end;
163
164 if (need_resched())
165 goto end;
166
167 cpu = smp_processor_id();
168 BUG_ON(local_softirq_pending());
169
170 now = ktime_get();
171 /*
172 * When called from irq_exit we need to account the idle sleep time
173 * correctly.
174 */
175 if (ts->tick_stopped) {
176 delta = ktime_sub(now, ts->idle_entrytime);
177 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
178 }
179
180 ts->idle_entrytime = now;
181 ts->idle_calls++;
182
183 /* Read jiffies and the time when jiffies were updated last */
184 do {
185 seq = read_seqbegin(&xtime_lock);
186 last_update = last_jiffies_update;
187 last_jiffies = jiffies;
188 } while (read_seqretry(&xtime_lock, seq));
189
190 /* Get the next timer wheel timer */
191 next_jiffies = get_next_timer_interrupt(last_jiffies);
192 delta_jiffies = next_jiffies - last_jiffies;
193
194 /*
195	 * Do not stop the tick if we are only one jiffy off,
196	 * or if the cpu is required for RCU
197 */
198 if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu)))
199 goto out;
200
201 /* Schedule the tick, if we are at least one jiffie off */
202 if ((long)delta_jiffies >= 1) {
203
204 if (rcu_needs_cpu(cpu))
205 delta_jiffies = 1;
206 else
207 cpu_set(cpu, nohz_cpu_mask);
208 /*
209 * nohz_stop_sched_tick can be called several times before
210 * the nohz_restart_sched_tick is called. This happens when
211 * interrupts arrive which do not cause a reschedule. In the
212 * first call we save the current tick time, so we can restart
213 * the scheduler tick in nohz_restart_sched_tick.
214 */
215 if (!ts->tick_stopped) {
216 ts->idle_tick = ts->sched_timer.expires;
217 ts->tick_stopped = 1;
218 ts->idle_jiffies = last_jiffies;
219 }
220 /*
221 * calculate the expiry time for the next timer wheel
222 * timer
223 */
224 expires = ktime_add_ns(last_update, tick_period.tv64 *
225 delta_jiffies);
226 ts->idle_expires = expires;
227 ts->idle_sleeps++;
228
229 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
230 hrtimer_start(&ts->sched_timer, expires,
231 HRTIMER_MODE_ABS);
232 /* Check, if the timer was already in the past */
233 if (hrtimer_active(&ts->sched_timer))
234 goto out;
235 } else if(!tick_program_event(expires, 0))
236 goto out;
237 /*
238 * We are past the event already. So we crossed a
239 * jiffie boundary. Update jiffies and raise the
240 * softirq.
241 */
242 tick_do_update_jiffies64(ktime_get());
243 cpu_clear(cpu, nohz_cpu_mask);
244 }
245 raise_softirq_irqoff(TIMER_SOFTIRQ);
246out:
247 ts->next_jiffies = next_jiffies;
248 ts->last_jiffies = last_jiffies;
249end:
250 local_irq_restore(flags);
251}
252
253/**
254 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
255 *
256 * Restart the idle tick when the CPU is woken up from idle
257 */
258void tick_nohz_restart_sched_tick(void)
259{
260 int cpu = smp_processor_id();
261 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
262 unsigned long ticks;
263 ktime_t now, delta;
264
265 if (!ts->tick_stopped)
266 return;
267
268 /* Update jiffies first */
269 now = ktime_get();
270
271 local_irq_disable();
272 tick_do_update_jiffies64(now);
273 cpu_clear(cpu, nohz_cpu_mask);
274
275 /* Account the idle time */
276 delta = ktime_sub(now, ts->idle_entrytime);
277 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
278
279 /*
280	 * We stopped the tick in idle. update_process_times() would miss the
281	 * time we slept, as it only does a single tick worth of
282	 * accounting. Enforce that this time is accounted to idle!
283 */
284 ticks = jiffies - ts->idle_jiffies;
285 /*
286 * We might be one off. Do not randomly account a huge number of ticks!
287 */
288 if (ticks && ticks < LONG_MAX) {
289 add_preempt_count(HARDIRQ_OFFSET);
290 account_system_time(current, HARDIRQ_OFFSET,
291 jiffies_to_cputime(ticks));
292 sub_preempt_count(HARDIRQ_OFFSET);
293 }
294
295 /*
296 * Cancel the scheduled timer and restore the tick
297 */
298 ts->tick_stopped = 0;
299 hrtimer_cancel(&ts->sched_timer);
300 ts->sched_timer.expires = ts->idle_tick;
301
302 while (1) {
303 /* Forward the time to expire in the future */
304 hrtimer_forward(&ts->sched_timer, now, tick_period);
305
306 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
307 hrtimer_start(&ts->sched_timer,
308 ts->sched_timer.expires,
309 HRTIMER_MODE_ABS);
310 /* Check, if the timer was already in the past */
311 if (hrtimer_active(&ts->sched_timer))
312 break;
313 } else {
314 if (!tick_program_event(ts->sched_timer.expires, 0))
315 break;
316 }
317 /* Update jiffies and reread time */
318 tick_do_update_jiffies64(now);
319 now = ktime_get();
320 }
321 local_irq_enable();
322}
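
The stop/restart pair above is meant to bracket the architecture idle loop. A hedged sketch of that usage; the idle loop and safe_halt() are illustrative placeholders, not lifted from an architecture in this diff.

/*
 * Illustrative idle loop only: real arch code differs, but this is the
 * intended bracket for the two functions above.
 */
static void example_cpu_idle_loop(void)
{
	while (1) {
		tick_nohz_stop_sched_tick();	/* may stop the periodic tick */

		while (!need_resched())
			safe_halt();		/* arch specific idle, placeholder */

		tick_nohz_restart_sched_tick();	/* account idle time, rearm the tick */
		schedule();
	}
}
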
323
324static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
325{
326 hrtimer_forward(&ts->sched_timer, now, tick_period);
327 return tick_program_event(ts->sched_timer.expires, 0);
328}
329
330/*
331 * The nohz low res interrupt handler
332 */
333static void tick_nohz_handler(struct clock_event_device *dev)
334{
335 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
336 struct pt_regs *regs = get_irq_regs();
337 ktime_t now = ktime_get();
338
339 dev->next_event.tv64 = KTIME_MAX;
340
341 /* Check, if the jiffies need an update */
342 tick_do_update_jiffies64(now);
343
344 /*
345 * When we are idle and the tick is stopped, we have to touch
346 * the watchdog as we might not schedule for a really long
347 * time. This happens on complete idle SMP systems while
348 * waiting on the login prompt. We also increment the "start
349 * of idle" jiffy stamp so the idle accounting adjustment we
350	 * do when we go busy again does not account too many ticks.
351 */
352 if (ts->tick_stopped) {
353 touch_softlockup_watchdog();
354 ts->idle_jiffies++;
355 }
356
357 update_process_times(user_mode(regs));
358 profile_tick(CPU_PROFILING);
359
360 /* Do not restart, when we are in the idle loop */
361 if (ts->tick_stopped)
362 return;
363
364 while (tick_nohz_reprogram(ts, now)) {
365 now = ktime_get();
366 tick_do_update_jiffies64(now);
367 }
368}
369
370/**
371 * tick_nohz_switch_to_nohz - switch to nohz mode
372 */
373static void tick_nohz_switch_to_nohz(void)
374{
375 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
376 ktime_t next;
377
378 if (!tick_nohz_enabled)
379 return;
380
381 local_irq_disable();
382 if (tick_switch_to_oneshot(tick_nohz_handler)) {
383 local_irq_enable();
384 return;
385 }
386
387 ts->nohz_mode = NOHZ_MODE_LOWRES;
388
389 /*
390 * Recycle the hrtimer in ts, so we can share the
391 * hrtimer_forward with the highres code.
392 */
393 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
394 /* Get the next period */
395 next = tick_init_jiffy_update();
396
397 for (;;) {
398 ts->sched_timer.expires = next;
399 if (!tick_program_event(next, 0))
400 break;
401 next = ktime_add(next, tick_period);
402 }
403 local_irq_enable();
404
405 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
406 smp_processor_id());
407}
408
409#else
410
411static inline void tick_nohz_switch_to_nohz(void) { }
412
413#endif /* NO_HZ */
414
415/*
416 * High resolution timer specific code
417 */
418#ifdef CONFIG_HIGH_RES_TIMERS
419/*
420 * We rearm the timer until we get disabled by the idle code
421 * Called with interrupts disabled and timer->base->cpu_base->lock held.
422 */
423static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
424{
425 struct tick_sched *ts =
426 container_of(timer, struct tick_sched, sched_timer);
427 struct hrtimer_cpu_base *base = timer->base->cpu_base;
428 struct pt_regs *regs = get_irq_regs();
429 ktime_t now = ktime_get();
430
431 /* Check, if the jiffies need an update */
432 tick_do_update_jiffies64(now);
433
434 /*
435 * Do not call, when we are not in irq context and have
436 * no valid regs pointer
437 */
438 if (regs) {
439 /*
440 * When we are idle and the tick is stopped, we have to touch
441 * the watchdog as we might not schedule for a really long
442 * time. This happens on complete idle SMP systems while
443 * waiting on the login prompt. We also increment the "start of
444 * idle" jiffy stamp so the idle accounting adjustment we do
445		 * when we go busy again does not account too many ticks.
446 */
447 if (ts->tick_stopped) {
448 touch_softlockup_watchdog();
449 ts->idle_jiffies++;
450 }
451 /*
452 * update_process_times() might take tasklist_lock, hence
453 * drop the base lock. sched-tick hrtimers are per-CPU and
454 * never accessible by userspace APIs, so this is safe to do.
455 */
456 spin_unlock(&base->lock);
457 update_process_times(user_mode(regs));
458 profile_tick(CPU_PROFILING);
459 spin_lock(&base->lock);
460 }
461
462 /* Do not restart, when we are in the idle loop */
463 if (ts->tick_stopped)
464 return HRTIMER_NORESTART;
465
466 hrtimer_forward(timer, now, tick_period);
467
468 return HRTIMER_RESTART;
469}
470
471/**
472 * tick_setup_sched_timer - setup the tick emulation timer
473 */
474void tick_setup_sched_timer(void)
475{
476 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
477 ktime_t now = ktime_get();
478
479 /*
480 * Emulate tick processing via per-CPU hrtimers:
481 */
482 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
483 ts->sched_timer.function = tick_sched_timer;
484 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
485
486 /* Get the next period */
487 ts->sched_timer.expires = tick_init_jiffy_update();
488
489 for (;;) {
490 hrtimer_forward(&ts->sched_timer, now, tick_period);
491 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
492 HRTIMER_MODE_ABS);
493 /* Check, if the timer was already in the past */
494 if (hrtimer_active(&ts->sched_timer))
495 break;
496 now = ktime_get();
497 }
498
499#ifdef CONFIG_NO_HZ
500 if (tick_nohz_enabled)
501 ts->nohz_mode = NOHZ_MODE_HIGHRES;
502#endif
503}
504
505void tick_cancel_sched_timer(int cpu)
506{
507 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
508
509 if (ts->sched_timer.base)
510 hrtimer_cancel(&ts->sched_timer);
511 ts->tick_stopped = 0;
512 ts->nohz_mode = NOHZ_MODE_INACTIVE;
513}
514#endif /* HIGH_RES_TIMERS */
515
516/**
517 * Async notification about clocksource changes
518 */
519void tick_clock_notify(void)
520{
521 int cpu;
522
523 for_each_possible_cpu(cpu)
524 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
525}
526
527/*
528 * Async notification about clock event changes
529 */
530void tick_oneshot_notify(void)
531{
532 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
533
534 set_bit(0, &ts->check_clocks);
535}
536
537/**
538 * Check whether a change happened which makes oneshot mode possible.
539 *
540 * Called cyclically from the hrtimer softirq (driven by the timer
541 * softirq). allow_nohz signals that we can switch into low-res nohz
542 * mode because high resolution timers are disabled (either at compile
543 * time or at runtime).
544 */
545int tick_check_oneshot_change(int allow_nohz)
546{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548
549 if (!test_and_clear_bit(0, &ts->check_clocks))
550 return 0;
551
552 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
553 return 0;
554
555 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
556 return 0;
557
558 if (!allow_nohz)
559 return 1;
560
561 tick_nohz_switch_to_nohz();
562 return 0;
563}
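
tick_check_oneshot_change() above is intended to be polled from the timer/hrtimer softirq path: with allow_nohz set it switches to low-res nohz itself, otherwise it returns 1 so the caller can switch to high resolution mode. A hedged sketch of the expected call site; hrtimer_is_hres_enabled() and hrtimer_switch_to_hres() are assumed hrtimer-side helpers, not part of this file.

/*
 * Illustrative call site only.
 */
static void example_hrtimer_softirq_tail(void)
{
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
		hrtimer_switch_to_hres();
	/* else: low-res nohz was selected inside tick_check_oneshot_change() */
}
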
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
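
The /proc/timer_list file created above is plain seq_file output, so it can be read like any text file; SysRq-Q prints the same data to the console. A trivial, hypothetical userspace reader for illustration:

/* Illustrative userspace dump of /proc/timer_list (not part of the patch). */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/timer_list", "r");

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
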
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * timer_stats is based on timer_top, a similar functionality which was part of
10 * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear search based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
15 * pre hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and counting of timer functions being called,
26 * the pid and cmdline from the owner process if applicable.
27 *
28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They get freed when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173 * The fastpath is when the entry is already hashed,
174 * we do this with the lookup lock held, but with the
175 * table lock not held:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * When the timer is already registered, then the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
233	 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
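
As tstats_write() above shows, the control write must be exactly two bytes ('1' or '0' plus one more character), which is what a shell echo produces. An equivalent, hypothetical userspace snippet for illustration:

/*
 * Illustrative userspace control of /proc/timer_stats (not part of the
 * patch): start collection, then dump the table.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char line[256];
	FILE *stats;
	int fd = open("/proc/timer_stats", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "1\n", 2);		/* count must be exactly 2 */
	close(fd);

	/* ... run the workload of interest for a while ... */

	stats = fopen("/proc/timer_stats", "r");
	if (!stats)
		return 1;
	while (fgets(line, sizeof(line), stats))
		fputs(line, stdout);
	fclose(stats);
	return 0;
}
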
diff --git a/kernel/timer.c b/kernel/timer.c
index 4902181e10e6..cb1b86a9c52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/tick.h>
38#include <linux/kallsyms.h>
37 39
38#include <asm/uaccess.h> 40#include <asm/uaccess.h>
39#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
262 list_add_tail(&timer->entry, vec); 264 list_add_tail(&timer->entry, vec);
263} 265}
264 266
267#ifdef CONFIG_TIMER_STATS
268void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
269{
270 if (timer->start_site)
271 return;
272
273 timer->start_site = addr;
274 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
275 timer->start_pid = current->pid;
276}
277#endif
278
265/** 279/**
266 * init_timer - initialize a timer. 280 * init_timer - initialize a timer.
267 * @timer: the timer to be initialized 281 * @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
273{ 287{
274 timer->entry.next = NULL; 288 timer->entry.next = NULL;
275 timer->base = __raw_get_cpu_var(tvec_bases); 289 timer->base = __raw_get_cpu_var(tvec_bases);
290#ifdef CONFIG_TIMER_STATS
291 timer->start_site = NULL;
292 timer->start_pid = -1;
293 memset(timer->start_comm, 0, TASK_COMM_LEN);
294#endif
276} 295}
277EXPORT_SYMBOL(init_timer); 296EXPORT_SYMBOL(init_timer);
278 297
279static inline void detach_timer(struct timer_list *timer, 298static inline void detach_timer(struct timer_list *timer,
280 int clear_pending) 299 int clear_pending)
281{ 300{
282 struct list_head *entry = &timer->entry; 301 struct list_head *entry = &timer->entry;
283 302
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
324 unsigned long flags; 343 unsigned long flags;
325 int ret = 0; 344 int ret = 0;
326 345
346 timer_stats_timer_set_start_info(timer);
327 BUG_ON(!timer->function); 347 BUG_ON(!timer->function);
328 348
329 base = lock_timer_base(timer, &flags); 349 base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
374 tvec_base_t *base = per_cpu(tvec_bases, cpu); 394 tvec_base_t *base = per_cpu(tvec_bases, cpu);
375 unsigned long flags; 395 unsigned long flags;
376 396
397 timer_stats_timer_set_start_info(timer);
377 BUG_ON(timer_pending(timer) || !timer->function); 398 BUG_ON(timer_pending(timer) || !timer->function);
378 spin_lock_irqsave(&base->lock, flags); 399 spin_lock_irqsave(&base->lock, flags);
379 timer->base = base; 400 timer->base = base;
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
406{ 427{
407 BUG_ON(!timer->function); 428 BUG_ON(!timer->function);
408 429
430 timer_stats_timer_set_start_info(timer);
409 /* 431 /*
410 * This is a common optimization triggered by the 432 * This is a common optimization triggered by the
411 * networking code - if the timer is re-modified 433 * networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
436 unsigned long flags; 458 unsigned long flags;
437 int ret = 0; 459 int ret = 0;
438 460
461 timer_stats_timer_clear_start_info(timer);
439 if (timer_pending(timer)) { 462 if (timer_pending(timer)) {
440 base = lock_timer_base(timer, &flags); 463 base = lock_timer_base(timer, &flags);
441 if (timer_pending(timer)) { 464 if (timer_pending(timer)) {
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
569 fn = timer->function; 592 fn = timer->function;
570 data = timer->data; 593 data = timer->data;
571 594
595 timer_stats_account_timer(timer);
596
572 set_running_timer(base, timer); 597 set_running_timer(base, timer);
573 detach_timer(timer, 1); 598 detach_timer(timer, 1);
574 spin_unlock_irq(&base->lock); 599 spin_unlock_irq(&base->lock);
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
591 spin_unlock_irq(&base->lock); 616 spin_unlock_irq(&base->lock);
592} 617}
593 618
594#ifdef CONFIG_NO_IDLE_HZ 619#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
595/* 620/*
596 * Find out when the next timer event is due to happen. This 621 * Find out when the next timer event is due to happen. This
597 * is used on S/390 to stop all activity when a CPU is idle. 622 * is used on S/390 to stop all activity when a CPU is idle.
598 * This function needs to be called with interrupts disabled. 623 * This function needs to be called with interrupts disabled.
599 */ 624 */
600unsigned long next_timer_interrupt(void) 625static unsigned long __next_timer_interrupt(tvec_base_t *base)
601{ 626{
602 tvec_base_t *base; 627 unsigned long timer_jiffies = base->timer_jiffies;
603 struct list_head *list; 628 unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
629 int index, slot, array, found = 0;
604 struct timer_list *nte; 630 struct timer_list *nte;
605 unsigned long expires;
606 unsigned long hr_expires = MAX_JIFFY_OFFSET;
607 ktime_t hr_delta;
608 tvec_t *varray[4]; 631 tvec_t *varray[4];
609 int i, j;
610
611 hr_delta = hrtimer_get_next_event();
612 if (hr_delta.tv64 != KTIME_MAX) {
613 struct timespec tsdelta;
614 tsdelta = ktime_to_timespec(hr_delta);
615 hr_expires = timespec_to_jiffies(&tsdelta);
616 if (hr_expires < 3)
617 return hr_expires + jiffies;
618 }
619 hr_expires += jiffies;
620
621 base = __get_cpu_var(tvec_bases);
622 spin_lock(&base->lock);
623 expires = base->timer_jiffies + (LONG_MAX >> 1);
624 list = NULL;
625 632
626 /* Look for timer events in tv1. */ 633 /* Look for timer events in tv1. */
627 j = base->timer_jiffies & TVR_MASK; 634 index = slot = timer_jiffies & TVR_MASK;
628 do { 635 do {
629 list_for_each_entry(nte, base->tv1.vec + j, entry) { 636 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
637 found = 1;
630 expires = nte->expires; 638 expires = nte->expires;
631 if (j < (base->timer_jiffies & TVR_MASK)) 639 /* Look at the cascade bucket(s)? */
632 list = base->tv2.vec + (INDEX(0)); 640 if (!index || slot < index)
633 goto found; 641 goto cascade;
642 return expires;
634 } 643 }
635 j = (j + 1) & TVR_MASK; 644 slot = (slot + 1) & TVR_MASK;
636 } while (j != (base->timer_jiffies & TVR_MASK)); 645 } while (slot != index);
646
647cascade:
648 /* Calculate the next cascade event */
649 if (index)
650 timer_jiffies += TVR_SIZE - index;
651 timer_jiffies >>= TVR_BITS;
637 652
638 /* Check tv2-tv5. */ 653 /* Check tv2-tv5. */
639 varray[0] = &base->tv2; 654 varray[0] = &base->tv2;
640 varray[1] = &base->tv3; 655 varray[1] = &base->tv3;
641 varray[2] = &base->tv4; 656 varray[2] = &base->tv4;
642 varray[3] = &base->tv5; 657 varray[3] = &base->tv5;
643 for (i = 0; i < 4; i++) { 658
644 j = INDEX(i); 659 for (array = 0; array < 4; array++) {
660 tvec_t *varp = varray[array];
661
662 index = slot = timer_jiffies & TVN_MASK;
645 do { 663 do {
646 if (list_empty(varray[i]->vec + j)) { 664 list_for_each_entry(nte, varp->vec + slot, entry) {
647 j = (j + 1) & TVN_MASK; 665 found = 1;
648 continue;
649 }
650 list_for_each_entry(nte, varray[i]->vec + j, entry)
651 if (time_before(nte->expires, expires)) 666 if (time_before(nte->expires, expires))
652 expires = nte->expires; 667 expires = nte->expires;
653 if (j < (INDEX(i)) && i < 3) 668 }
654 list = varray[i + 1]->vec + (INDEX(i + 1)); 669 /*
655 goto found; 670 * Do we still search for the first timer or are
656 } while (j != (INDEX(i))); 671 * we looking up the cascade buckets ?
657 } 672 */
658found: 673 if (found) {
659 if (list) { 674 /* Look at the cascade bucket(s)? */
660 /* 675 if (!index || slot < index)
661 * The search wrapped. We need to look at the next list 676 break;
662 * from next tv element that would cascade into tv element 677 return expires;
663 * where we found the timer element. 678 }
664 */ 679 slot = (slot + 1) & TVN_MASK;
665 list_for_each_entry(nte, list, entry) { 680 } while (slot != index);
666 if (time_before(nte->expires, expires)) 681
667 expires = nte->expires; 682 if (index)
668 } 683 timer_jiffies += TVN_SIZE - index;
684 timer_jiffies >>= TVN_BITS;
669 } 685 }
670 spin_unlock(&base->lock); 686 return expires;
687}
671 688
672 /* 689/*
673 * It can happen that other CPUs service timer IRQs and increment 690 * Check if the next hrtimer event is before the next timer wheel
674 * jiffies, but we have not yet got a local timer tick to process 691 * event:
675 * the timer wheels. In that case, the expiry time can be before 692 */
676 * jiffies, but since the high-resolution timer here is relative to 693static unsigned long cmp_next_hrtimer_event(unsigned long now,
677 * jiffies, the default expression when high-resolution timers are 694 unsigned long expires)
678 * not active, 695{
679 * 696 ktime_t hr_delta = hrtimer_get_next_event();
680 * time_before(MAX_JIFFY_OFFSET + jiffies, expires) 697 struct timespec tsdelta;
681 * 698
682 * would falsely evaluate to true. If that is the case, just 699 if (hr_delta.tv64 == KTIME_MAX)
683 * return jiffies so that we can immediately fire the local timer 700 return expires;
684 */
685 if (time_before(expires, jiffies))
686 return jiffies;
687 701
688 if (time_before(hr_expires, expires)) 702 if (hr_delta.tv64 <= TICK_NSEC)
689 return hr_expires; 703 return now;
690 704
705 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta);
707 if (time_before(now, expires))
708 return now;
691 return expires; 709 return expires;
692} 710}
711
712/**
 713 * get_next_timer_interrupt - return the jiffy of the next pending timer
714 */
715unsigned long get_next_timer_interrupt(unsigned long now)
716{
717 tvec_base_t *base = __get_cpu_var(tvec_bases);
718 unsigned long expires;
719
720 spin_lock(&base->lock);
721 expires = __next_timer_interrupt(base);
722 spin_unlock(&base->lock);
723
724 if (time_before_eq(expires, now))
725 return now;
726
727 return cmp_next_hrtimer_event(now, expires);
728}
729
730#ifdef CONFIG_NO_IDLE_HZ
731unsigned long next_timer_interrupt(void)
732{
733 return get_next_timer_interrupt(jiffies);
734}
735#endif
736
693#endif 737#endif
694 738
695/******************************************************************/ 739/******************************************************************/
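
The rewritten __next_timer_interrupt() above first scans tv1 for the earliest pending timer and keeps searching the tv2..tv5 buckets only while an earlier expiry could still cascade in from a higher level. The slot arithmetic it relies on can be exercised in isolation; the sketch below is a standalone userspace program, and the TVR_BITS/TVN_BITS values are the usual defaults, assumed here rather than taken from this hunk.

#include <stdio.h>

#define TVR_BITS 8
#define TVN_BITS 6
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK (TVN_SIZE - 1)

int main(void)
{
        unsigned long timer_jiffies = 1000123;
        unsigned long index;
        int level;

        /* tv1 bucket: the low TVR_BITS bits of the wheel's current time */
        index = timer_jiffies & TVR_MASK;
        printf("tv1 slot = %lu\n", index);

        /* advance to the tv1 cascade boundary, then derive the tv2..tv5 slots */
        if (index)
                timer_jiffies += TVR_SIZE - index;
        timer_jiffies >>= TVR_BITS;

        for (level = 0; level < 4; level++) {
                index = timer_jiffies & TVN_MASK;
                printf("tv%d slot = %lu\n", level + 2, index);
                if (index)
                        timer_jiffies += TVN_SIZE - index;
                timer_jiffies >>= TVN_BITS;
        }
        return 0;
}

Compiled as ordinary C, it prints the wheel slots that a given timer_jiffies value selects at each level.
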
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
832 * 876 *
833 * Accumulates current time interval and initializes new clocksource 877 * Accumulates current time interval and initializes new clocksource
834 */ 878 */
835static int change_clocksource(void) 879static void change_clocksource(void)
836{ 880{
837 struct clocksource *new; 881 struct clocksource *new;
838 cycle_t now; 882 cycle_t now;
839 u64 nsec; 883 u64 nsec;
884
840 new = clocksource_get_next(); 885 new = clocksource_get_next();
841 if (clock != new) { 886
842 now = clocksource_read(new); 887 if (clock == new)
843 nsec = __get_nsec_offset(); 888 return;
844 timespec_add_ns(&xtime, nsec); 889
845 890 now = clocksource_read(new);
846 clock = new; 891 nsec = __get_nsec_offset();
847 clock->cycle_last = now; 892 timespec_add_ns(&xtime, nsec);
848 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 893
849 clock->name); 894 clock = new;
850 return 1; 895 clock->cycle_last = now;
851 } else if (clock->update_callback) { 896
852 return clock->update_callback(); 897 clock->error = 0;
853 } 898 clock->xtime_nsec = 0;
854 return 0; 899 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
900
901 tick_clock_notify();
902
903 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
904 clock->name);
855} 905}
856#else 906#else
857static inline int change_clocksource(void) 907static inline void change_clocksource(void) { }
858{
859 return 0;
860}
861#endif 908#endif
862 909
863/** 910/**
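
change_clocksource() now re-initializes the accumulation state (error, xtime_nsec, interval) itself and notifies the tick layer via tick_clock_notify() whenever clocksource_get_next() returns a better clock than the one in use. For context, a hedged sketch of what such a clocksource looks like on the driver side; the counter frequency, rating and names are invented, and the helper and flag spellings are assumptions based on the clocksource API of this era rather than anything shown in this hunk.

#include <linux/clocksource.h>
#include <linux/init.h>

static cycle_t example_read(void)
{
        return 0;       /* a real driver reads its free-running counter here */
}

static struct clocksource clocksource_example = {
        .name   = "example",
        .rating = 200,                          /* middling quality */
        .read   = example_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .shift  = 20,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,   /* candidate for high-res use */
};

static int __init example_clocksource_init(void)
{
        /* scale a made-up 14.318 MHz counter to nanoseconds */
        clocksource_example.mult = clocksource_hz2mult(14318180,
                                                       clocksource_example.shift);
        return clocksource_register(&clocksource_example);
}
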
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
871 do { 918 do {
872 seq = read_seqbegin(&xtime_lock); 919 seq = read_seqbegin(&xtime_lock);
873 920
874 ret = clock->is_continuous; 921 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
875 922
876 } while (read_seqretry(&xtime_lock, seq)); 923 } while (read_seqretry(&xtime_lock, seq));
877 924
878 return ret; 925 return ret;
879} 926}
880 927
928/**
929 * read_persistent_clock - Return time in seconds from the persistent clock.
930 *
931 * Weak dummy function for arches that do not yet support it.
932 * Returns seconds from epoch using the battery backed persistent clock.
933 * Returns zero if unsupported.
934 *
935 * XXX - Do be sure to remove it once all arches implement it.
936 */
937unsigned long __attribute__((weak)) read_persistent_clock(void)
938{
939 return 0;
940}
941
881/* 942/*
882 * timekeeping_init - Initializes the clocksource and common timekeeping values 943 * timekeeping_init - Initializes the clocksource and common timekeeping values
883 */ 944 */
884void __init timekeeping_init(void) 945void __init timekeeping_init(void)
885{ 946{
886 unsigned long flags; 947 unsigned long flags;
948 unsigned long sec = read_persistent_clock();
887 949
888 write_seqlock_irqsave(&xtime_lock, flags); 950 write_seqlock_irqsave(&xtime_lock, flags);
889 951
890 ntp_clear(); 952 ntp_clear();
891 953
892 clock = clocksource_get_next(); 954 clock = clocksource_get_next();
893 clocksource_calculate_interval(clock, tick_nsec); 955 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
894 clock->cycle_last = clocksource_read(clock); 956 clock->cycle_last = clocksource_read(clock);
895 957
958 xtime.tv_sec = sec;
959 xtime.tv_nsec = 0;
960 set_normalized_timespec(&wall_to_monotonic,
961 -xtime.tv_sec, -xtime.tv_nsec);
962
896 write_sequnlock_irqrestore(&xtime_lock, flags); 963 write_sequnlock_irqrestore(&xtime_lock, flags);
897} 964}
898 965
899 966/* flag indicating whether timekeeping is suspended */
900static int timekeeping_suspended; 967static int timekeeping_suspended;
968/* time in seconds when suspend began */
969static unsigned long timekeeping_suspend_time;
970
901/** 971/**
902 * timekeeping_resume - Resumes the generic timekeeping subsystem. 972 * timekeeping_resume - Resumes the generic timekeeping subsystem.
903 * @dev: unused 973 * @dev: unused
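
The weak read_persistent_clock() default introduced above lets timekeeping_init() and the suspend/resume paths compile on every architecture while returning 0 where no battery-backed clock is wired up yet. A sketch of how a port would override it; the RTC helper is hypothetical and stands in for whatever hardware access the architecture actually has:

#include <linux/kernel.h>

/* Hypothetical RTC accessor, for this sketch only. */
static unsigned long example_rtc_read_seconds(void)
{
        return 0;
}

/*
 * An architecture overrides the weak default simply by providing a
 * strong definition with the same prototype; the linker prefers it.
 */
unsigned long read_persistent_clock(void)
{
        return example_rtc_read_seconds();
}
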
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
909static int timekeeping_resume(struct sys_device *dev) 979static int timekeeping_resume(struct sys_device *dev)
910{ 980{
911 unsigned long flags; 981 unsigned long flags;
982 unsigned long now = read_persistent_clock();
912 983
913 write_seqlock_irqsave(&xtime_lock, flags); 984 write_seqlock_irqsave(&xtime_lock, flags);
914 /* restart the last cycle value */ 985
986 if (now && (now > timekeeping_suspend_time)) {
987 unsigned long sleep_length = now - timekeeping_suspend_time;
988
989 xtime.tv_sec += sleep_length;
990 wall_to_monotonic.tv_sec -= sleep_length;
991 }
992 /* re-base the last cycle value */
915 clock->cycle_last = clocksource_read(clock); 993 clock->cycle_last = clocksource_read(clock);
916 clock->error = 0; 994 clock->error = 0;
917 timekeeping_suspended = 0; 995 timekeeping_suspended = 0;
918 write_sequnlock_irqrestore(&xtime_lock, flags); 996 write_sequnlock_irqrestore(&xtime_lock, flags);
997
998 touch_softlockup_watchdog();
999 /* Resume hrtimers */
1000 clock_was_set();
1001
919 return 0; 1002 return 0;
920} 1003}
921 1004
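
The resume path above adds the slept time to xtime and subtracts it from wall_to_monotonic, so wall-clock time jumps forward by the sleep length while monotonic time (wall time plus that offset) does not. A standalone arithmetic check, with made-up numbers:

#include <stdio.h>

int main(void)
{
        long xtime_sec = 1000000;               /* wall clock, seconds */
        long wall_to_monotonic_sec = -999000;   /* offset maintained by the kernel */
        long suspend_time = 1000000;            /* read_persistent_clock() at suspend */
        long resume_time  = 1000060;            /* read_persistent_clock() at resume  */
        long sleep_length = resume_time - suspend_time;

        xtime_sec += sleep_length;
        wall_to_monotonic_sec -= sleep_length;

        printf("wall time after resume      : %ld\n", xtime_sec);
        printf("monotonic time after resume : %ld\n",
               xtime_sec + wall_to_monotonic_sec);
        return 0;
}

Only the wall-clock value moves; the monotonic sum is the same before and after the 60-second sleep.
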
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
925 1008
926 write_seqlock_irqsave(&xtime_lock, flags); 1009 write_seqlock_irqsave(&xtime_lock, flags);
927 timekeeping_suspended = 1; 1010 timekeeping_suspended = 1;
1011 timekeeping_suspend_time = read_persistent_clock();
928 write_sequnlock_irqrestore(&xtime_lock, flags); 1012 write_sequnlock_irqrestore(&xtime_lock, flags);
929 return 0; 1013 return 0;
930} 1014}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
1089 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1173 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1090 1174
1091 /* check to see if there is a new clocksource to use */ 1175 /* check to see if there is a new clocksource to use */
1092 if (change_clocksource()) { 1176 change_clocksource();
1093 clock->error = 0; 1177 update_vsyscall(&xtime, clock);
1094 clock->xtime_nsec = 0;
1095 clocksource_calculate_interval(clock, tick_nsec);
1096 }
1097} 1178}
1098 1179
1099/* 1180/*
@@ -1173,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
1173{ 1254{
1174 tvec_base_t *base = __get_cpu_var(tvec_bases); 1255 tvec_base_t *base = __get_cpu_var(tvec_bases);
1175 1256
1176 hrtimer_run_queues(); 1257 hrtimer_run_queues();
1258
1177 if (time_after_eq(jiffies, base->timer_jiffies)) 1259 if (time_after_eq(jiffies, base->timer_jiffies))
1178 __run_timers(base); 1260 __run_timers(base);
1179} 1261}
@@ -1619,6 +1701,8 @@ void __init init_timers(void)
1619 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1701 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1620 (void *)(long)smp_processor_id()); 1702 (void *)(long)smp_processor_id());
1621 1703
1704 init_timer_stats();
1705
1622 BUG_ON(err == NOTIFY_BAD); 1706 BUG_ON(err == NOTIFY_BAD);
1623 register_cpu_notifier(&timers_nb); 1707 register_cpu_notifier(&timers_nb);
1624 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1708 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc3691415..658f638c402c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24 24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/* 25/*
28 * fill in basic accounting fields 26 * fill in basic accounting fields
29 */ 27 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 020d1fff57dc..b6fa5e63085d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
218} 218}
219EXPORT_SYMBOL_GPL(queue_work); 219EXPORT_SYMBOL_GPL(queue_work);
220 220
221static void delayed_work_timer_fn(unsigned long __data) 221void delayed_work_timer_fn(unsigned long __data)
222{ 222{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
245 struct timer_list *timer = &dwork->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work; 246 struct work_struct *work = &dwork->work;
247 247
248 timer_stats_timer_set_start_info(timer);
248 if (delay == 0) 249 if (delay == 0)
249 return queue_work(wq, work); 250 return queue_work(wq, work);
250 251
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
593 * After waiting for a given time, this puts a job in the kernel-global 594 * After waiting for a given time, this puts a job in the kernel-global
594 * workqueue. 595 * workqueue.
595 */ 596 */
596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 597int fastcall schedule_delayed_work(struct delayed_work *dwork,
598 unsigned long delay)
597{ 599{
600 timer_stats_timer_set_start_info(&dwork->timer);
598 return queue_delayed_work(keventd_wq, dwork, delay); 601 return queue_delayed_work(keventd_wq, dwork, delay);
599} 602}
600EXPORT_SYMBOL(schedule_delayed_work); 603EXPORT_SYMBOL(schedule_delayed_work);
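
queue_delayed_work() and schedule_delayed_work() now tag the embedded timer via timer_stats_timer_set_start_info(), so /proc/timer_stats can point back at the code that queued the work rather than only at the generic delayed_work_timer_fn(). A minimal caller-side sketch, assuming the delayed_work API visible in this hunk; the names are invented:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_work_fn(struct work_struct *work)
{
        /* runs in keventd context roughly one second after scheduling */
}

static DECLARE_DELAYED_WORK(example_work, example_work_fn);

static void example_kick(void)
{
        /*
         * The timer embedded in example_work is what /proc/timer_stats
         * will now attribute to delayed_work_timer_fn and this call site.
         */
        schedule_delayed_work(&example_work, HZ);
}
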
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 63f04c15e6f5..4448f91b865c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -134,6 +134,17 @@ config SCHEDSTATS
134 application, you can say N to avoid the very slight overhead 134 application, you can say N to avoid the very slight overhead
135 this adds. 135 this adds.
136 136
137config TIMER_STATS
138 bool "Collect kernel timer statistics"
139 depends on DEBUG_KERNEL && PROC_FS
140 help
141 If you say Y here, additional code will be inserted into the
142 timer routines to collect statistics about kernel timers being
143 reprogrammed. The statistics can be read from /proc/timer_stats.
144 The statistics collection is started by writing 1 to /proc/timer_stats
145 and stopped by writing 0. This feature is useful for collecting
146 information about timer usage patterns in the kernel and in userspace.
147
137config DEBUG_SLAB 148config DEBUG_SLAB
138 bool "Debug slab memory allocations" 149 bool "Debug slab memory allocations"
139 depends on DEBUG_KERNEL && SLAB 150 depends on DEBUG_KERNEL && SLAB
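
Per the TIMER_STATS help text above, collection is toggled by writing 1 or 0 to /proc/timer_stats and the accumulated data is read back from the same file. A small userspace sketch of that sequence (error handling kept minimal):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        FILE *f = fopen("/proc/timer_stats", "w");
        char line[256];

        if (!f)
                return 1;
        fputs("1\n", f);                /* start collection */
        fclose(f);

        sleep(10);                      /* let some timers fire */

        f = fopen("/proc/timer_stats", "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);

        f = fopen("/proc/timer_stats", "w");
        if (f) {
                fputs("0\n", f);        /* stop collection */
                fclose(f);
        }
        return 0;
}
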
diff --git a/lib/devres.c b/lib/devres.c
index 2a668dd7cac7..eb38849aa717 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -274,21 +274,21 @@ int pcim_iomap_regions(struct pci_dev *pdev, u16 mask, const char *name)
274 274
275 rc = pci_request_region(pdev, i, name); 275 rc = pci_request_region(pdev, i, name);
276 if (rc) 276 if (rc)
277 goto err_region; 277 goto err_inval;
278 278
279 rc = -ENOMEM; 279 rc = -ENOMEM;
280 if (!pcim_iomap(pdev, i, 0)) 280 if (!pcim_iomap(pdev, i, 0))
281 goto err_iomap; 281 goto err_region;
282 } 282 }
283 283
284 return 0; 284 return 0;
285 285
286 err_iomap:
287 pcim_iounmap(pdev, iomap[i]);
288 err_region: 286 err_region:
289 pci_release_region(pdev, i); 287 pci_release_region(pdev, i);
290 err_inval: 288 err_inval:
291 while (--i >= 0) { 289 while (--i >= 0) {
290 if (!(mask & (1 << i)))
291 continue;
292 pcim_iounmap(pdev, iomap[i]); 292 pcim_iounmap(pdev, iomap[i]);
293 pci_release_region(pdev, i); 293 pci_release_region(pdev, i);
294 } 294 }
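
The devres fix above swaps the error labels so that a failed pci_request_region() no longer tries to iounmap a mapping that was never made, and makes the unwind loop skip BARs that are not in the mask. A hedged sketch of the kind of managed probe path that relies on this unwind; the device name and BAR choices are invented, and the managed helpers are assumed from the same devres infrastructure:

#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        void __iomem *mmio;
        int rc;

        rc = pcim_enable_device(pdev);
        if (rc)
                return rc;

        /* request and iomap BARs 0 and 2; on any failure the corrected
         * error path in pcim_iomap_regions() releases whatever was grabbed */
        rc = pcim_iomap_regions(pdev, (1 << 0) | (1 << 2), "example");
        if (rc)
                return rc;

        mmio = pcim_iomap_table(pdev)[0];
        (void)mmio;             /* a real driver would start using BAR 0 here */
        return 0;
}
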
diff --git a/mm/filemap.c b/mm/filemap.c
index 00414849a867..d1060b8d3cd6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2079,21 +2079,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2079 /* Limit the size of the copy to the caller's write size */ 2079 /* Limit the size of the copy to the caller's write size */
2080 bytes = min(bytes, count); 2080 bytes = min(bytes, count);
2081 2081
2082 /* 2082 /* We only need to worry about prefaulting when writes are from
2083 * Limit the size of the copy to that of the current segment, 2083 * user-space. NFSd uses vfs_writev with several non-aligned
2084 * because fault_in_pages_readable() doesn't know how to walk 2084 * segments in the vector, and limiting to one segment at a time is
2085 * segments. 2085 * a noticeable performance hit for rewrites.
2086 */ 2086 */
2087 bytes = min(bytes, cur_iov->iov_len - iov_base); 2087 if (!segment_eq(get_fs(), KERNEL_DS)) {
2088 2088 /*
2089 /* 2089 * Limit the size of the copy to that of the current
2090 * Bring in the user page that we will copy from _first_. 2090 * segment, because fault_in_pages_readable() doesn't
2091 * Otherwise there's a nasty deadlock on copying from the 2091 * know how to walk segments.
2092 * same page as we're writing to, without it being marked 2092 */
2093 * up-to-date. 2093 bytes = min(bytes, cur_iov->iov_len - iov_base);
2094 */
2095 fault_in_pages_readable(buf, bytes);
2096 2094
2095 /*
2096 * Bring in the user page that we will copy from
2097 * _first_. Otherwise there's a nasty deadlock on
2098 * copying from the same page as we're writing to,
2099 * without it being marked up-to-date.
2100 */
2101 fault_in_pages_readable(buf, bytes);
2102 }
2097 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2103 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2098 if (!page) { 2104 if (!page) {
2099 status = -ENOMEM; 2105 status = -ENOMEM;
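
The filemap.c change above prefaults the source pages only for userspace writes, because the source buffer can be an mmap of the very page being written, and touching it for the first time inside the copy would deadlock on a page that is not yet up to date. A userspace illustration of that aliasing case (the file name is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);
        char *map;

        if (fd < 0)
                return 1;
        if (ftruncate(fd, 4096) < 0)
                return 1;
        map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;
        memset(map, 'x', 4096);

        /* source buffer and destination range are the same file page */
        if (pwrite(fd, map, 4096, 0) != 4096)
                perror("pwrite");

        munmap(map, 4096);
        close(fd);
        return 0;
}
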