-rw-r--r--  Documentation/networking/e100.txt | 158
-rw-r--r--  Documentation/networking/e1000.txt | 620
-rw-r--r--  MAINTAINERS | 16
-rw-r--r--  arch/alpha/mm/init.c | 2
-rw-r--r--  arch/arm/mm/consistent.c | 4
-rw-r--r--  arch/arm/mm/init.c | 2
-rw-r--r--  arch/arm26/mm/init.c | 2
-rw-r--r--  arch/cris/mm/init.c | 2
-rw-r--r--  arch/frv/kernel/frv_ksyms.c | 1
-rw-r--r--  arch/frv/mm/dma-alloc.c | 4
-rw-r--r--  arch/frv/mm/init.c | 6
-rw-r--r--  arch/h8300/kernel/h8300_ksyms.c | 1
-rw-r--r--  arch/h8300/mm/init.c | 4
-rw-r--r--  arch/i386/kernel/efi.c | 2
-rw-r--r--  arch/i386/kernel/smp.c | 28
-rw-r--r--  arch/i386/kernel/sys_i386.c | 25
-rw-r--r--  arch/i386/kernel/timers/timer_hpet.c | 2
-rw-r--r--  arch/i386/kernel/timers/timer_tsc.c | 2
-rw-r--r--  arch/i386/mm/hugetlbpage.c | 12
-rw-r--r--  arch/i386/mm/init.c | 6
-rw-r--r--  arch/i386/mm/pageattr.c | 20
-rw-r--r--  arch/ia64/Kconfig | 19
-rw-r--r--  arch/ia64/configs/tiger_defconfig | 2
-rw-r--r--  arch/ia64/kernel/acpi.c | 14
-rw-r--r--  arch/ia64/kernel/entry.S | 14
-rw-r--r--  arch/ia64/kernel/iosapic.c | 6
-rw-r--r--  arch/ia64/kernel/irq.c | 13
-rw-r--r--  arch/ia64/kernel/mca.c | 90
-rw-r--r--  arch/ia64/kernel/perfmon.c | 5
-rw-r--r--  arch/ia64/kernel/signal.c | 101
-rw-r--r--  arch/ia64/kernel/smpboot.c | 114
-rw-r--r--  arch/ia64/kernel/time.c | 9
-rw-r--r--  arch/ia64/kernel/topology.c | 2
-rw-r--r--  arch/ia64/mm/contig.c | 4
-rw-r--r--  arch/ia64/mm/discontig.c | 9
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/ia64/mm/init.c | 6
-rw-r--r--  arch/ia64/sn/kernel/Makefile | 3
-rw-r--r--  arch/ia64/sn/kernel/pio_phys.S | 71
-rw-r--r--  arch/ia64/sn/kernel/setup.c | 6
-rw-r--r--  arch/ia64/sn/kernel/sn2/sn2_smp.c | 21
-rw-r--r--  arch/ia64/sn/kernel/xpc_channel.c | 102
-rw-r--r--  arch/ia64/sn/kernel/xpc_main.c | 1
-rw-r--r--  arch/ia64/sn/kernel/xpc_partition.c | 28
-rw-r--r--  arch/ia64/sn/pci/tioce_provider.c | 326
-rw-r--r--  arch/m32r/mm/init.c | 4
-rw-r--r--  arch/m68k/mm/init.c | 2
-rw-r--r--  arch/m68k/mm/memory.c | 2
-rw-r--r--  arch/m68k/mm/motorola.c | 2
-rw-r--r--  arch/m68knommu/kernel/m68k_ksyms.c | 1
-rw-r--r--  arch/m68knommu/mm/init.c | 4
-rw-r--r--  arch/mips/arc/memory.c | 2
-rw-r--r--  arch/mips/dec/prom/memory.c | 2
-rw-r--r--  arch/mips/mips-boards/generic/memory.c | 2
-rw-r--r--  arch/mips/mips-boards/sim/sim_mem.c | 2
-rw-r--r--  arch/mips/mm/init.c | 11
-rw-r--r--  arch/mips/sgi-ip27/ip27-memory.c | 2
-rw-r--r--  arch/parisc/mm/init.c | 4
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 15
-rw-r--r--  arch/powerpc/mm/init_32.c | 4
-rw-r--r--  arch/powerpc/mm/init_64.c | 4
-rw-r--r--  arch/powerpc/mm/mem.c | 6
-rw-r--r--  arch/powerpc/platforms/cell/setup.c | 2
-rw-r--r--  arch/ppc/kernel/dma-mapping.c | 4
-rw-r--r--  arch/ppc/mm/init.c | 6
-rw-r--r--  arch/s390/mm/init.c | 4
-rw-r--r--  arch/sh/mm/consistent.c | 3
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 12
-rw-r--r--  arch/sh/mm/init.c | 4
-rw-r--r--  arch/sh64/mm/hugetlbpage.c | 12
-rw-r--r--  arch/sh64/mm/init.c | 4
-rw-r--r--  arch/sparc/kernel/sun4d_smp.c | 6
-rw-r--r--  arch/sparc/kernel/sun4m_smp.c | 6
-rw-r--r--  arch/sparc/mm/init.c | 6
-rw-r--r--  arch/sparc64/mm/hugetlbpage.c | 12
-rw-r--r--  arch/sparc64/mm/init.c | 4
-rw-r--r--  arch/um/kernel/mem.c | 4
-rw-r--r--  arch/um/kernel/physmem.c | 2
-rw-r--r--  arch/x86_64/kernel/time.c | 2
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c | 1
-rw-r--r--  arch/x86_64/mm/init.c | 6
-rw-r--r--  arch/x86_64/mm/pageattr.c | 63
-rw-r--r--  arch/xtensa/mm/init.c | 2
-rw-r--r--  arch/xtensa/mm/pgtable.c | 24
-rw-r--r--  drivers/char/snsc.h | 5
-rw-r--r--  drivers/char/snsc_event.c | 32
-rw-r--r--  drivers/char/tb0219.c | 24
-rw-r--r--  drivers/char/vr41xx_giu.c | 19
-rw-r--r--  drivers/char/vr41xx_rtc.c | 30
-rw-r--r--  drivers/char/watchdog/mv64x60_wdt.c | 20
-rw-r--r--  drivers/firmware/dcdbas.c | 110
-rw-r--r--  drivers/md/dm.c | 43
-rw-r--r--  drivers/media/dvb/bt8xx/Makefile | 2
-rw-r--r--  drivers/net/mv643xx_eth.h | 18
-rw-r--r--  drivers/net/pcnet32.c | 4143
-rw-r--r--  drivers/net/skfp/fplustm.c | 12
-rw-r--r--  drivers/net/skge.c | 275
-rw-r--r--  drivers/net/skge.h | 1
-rw-r--r--  drivers/net/sky2.c | 583
-rw-r--r--  drivers/net/sky2.h | 22
-rw-r--r--  drivers/net/smc91x.c | 53
-rw-r--r--  drivers/net/smc91x.h | 474
-rw-r--r--  drivers/scsi/iscsi_tcp.c | 2
-rw-r--r--  drivers/scsi/sg.c | 37
-rw-r--r--  drivers/serial/Kconfig | 2
-rw-r--r--  drivers/serial/serial_txx9.c | 77
-rw-r--r--  drivers/serial/vr41xx_siu.c | 24
-rw-r--r--  drivers/sn/ioc4.c | 41
-rw-r--r--  drivers/video/acornfb.c | 2
-rw-r--r--  drivers/video/i810/i810_main.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 3
-rw-r--r--  fs/buffer.c | 62
-rw-r--r--  fs/hugetlbfs/inode.c | 92
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/ramfs/file-nommu.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 1
-rw-r--r--  include/asm-i386/acpi.h | 10
-rw-r--r--  include/asm-i386/pgtable.h | 5
-rw-r--r--  include/asm-ia64/intel_intrin.h | 134
-rw-r--r--  include/asm-ia64/machvec.h | 13
-rw-r--r--  include/asm-ia64/machvec_sn2.h | 4
-rw-r--r--  include/asm-ia64/mca.h | 2
-rw-r--r--  include/asm-ia64/mutex.h | 93
-rw-r--r--  include/asm-ia64/page.h | 2
-rw-r--r--  include/asm-ia64/pgtable.h | 5
-rw-r--r--  include/asm-ia64/processor.h | 3
-rw-r--r--  include/asm-ia64/signal.h | 2
-rw-r--r--  include/asm-ia64/sn/addrs.h | 8
-rw-r--r--  include/asm-ia64/sn/rw_mmr.h | 56
-rw-r--r--  include/asm-ia64/sn/tioce.h | 36
-rw-r--r--  include/asm-ia64/sn/xpc.h | 22
-rw-r--r--  include/asm-ia64/system.h | 7
-rw-r--r--  include/asm-ia64/thread_info.h | 12
-rw-r--r--  include/asm-powerpc/pgtable.h | 5
-rw-r--r--  include/asm-s390/pgalloc.h | 7
-rw-r--r--  include/asm-sh64/pgalloc.h | 16
-rw-r--r--  include/asm-x86_64/pgtable.h | 4
-rw-r--r--  include/linux/hugetlb.h | 45
-rw-r--r--  include/linux/migrate.h | 36
-rw-r--r--  include/linux/mm.h | 48
-rw-r--r--  include/linux/mm_inline.h | 2
-rw-r--r--  include/linux/page-flags.h | 24
-rw-r--r--  include/linux/rtc.h | 4
-rw-r--r--  include/linux/slab.h | 3
-rw-r--r--  include/linux/smp.h | 23
-rw-r--r--  include/linux/swap.h | 38
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/sched.c | 6
-rw-r--r--  kernel/softirq.c | 20
-rw-r--r--  lib/string.c | 1
-rw-r--r--  mm/Kconfig | 6
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/filemap.c | 2
-rw-r--r--  mm/hugetlb.c | 286
-rw-r--r--  mm/internal.h | 34
-rw-r--r--  mm/memory.c | 21
-rw-r--r--  mm/mempolicy.c | 117
-rw-r--r--  mm/mempool.c | 4
-rw-r--r--  mm/migrate.c | 655
-rw-r--r--  mm/mmap.c | 10
-rw-r--r--  mm/mprotect.c | 12
-rw-r--r--  mm/nommu.c | 4
-rw-r--r--  mm/page_alloc.c | 113
-rw-r--r--  mm/readahead.c | 32
-rw-r--r--  mm/rmap.c | 14
-rw-r--r--  mm/shmem.c | 7
-rw-r--r--  mm/slab.c | 890
-rw-r--r--  mm/swap.c | 64
-rw-r--r--  mm/swap_state.c | 1
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/vmscan.c | 882
-rw-r--r--  security/keys/process_keys.c | 7
-rw-r--r--  security/selinux/hooks.c | 14
-rw-r--r--  security/selinux/selinuxfs.c | 112
-rw-r--r--  security/selinux/ss/services.c | 9
175 files changed, 6839 insertions(+), 5527 deletions(-)
diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt
index 4ef9f7cd5dc3..944aa55e79f8 100644
--- a/Documentation/networking/e100.txt
+++ b/Documentation/networking/e100.txt
@@ -1,16 +1,17 @@
1Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters 1Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters
2============================================================== 2==============================================================
3 3
4November 17, 2004 4November 15, 2005
5
6 5
7Contents 6Contents
8======== 7========
9 8
10- In This Release 9- In This Release
11- Identifying Your Adapter 10- Identifying Your Adapter
11- Building and Installation
12- Driver Configuration Parameters 12- Driver Configuration Parameters
13- Additional Configurations 13- Additional Configurations
14- Known Issues
14- Support 15- Support
15 16
16 17
@@ -18,18 +19,30 @@ In This Release
18=============== 19===============
19 20
20This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of 21This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of
21Adapters, version 3.3.x. This driver supports 2.4.x and 2.6.x kernels. 22Adapters. This driver includes support for Itanium(R)2-based systems.
23
24For questions related to hardware requirements, refer to the documentation
25supplied with your Intel PRO/100 adapter.
26
27The following features are now available in supported kernels:
28 - Native VLANs
29 - Channel Bonding (teaming)
30 - SNMP
31
32Channel Bonding documentation can be found in the Linux kernel source:
33/Documentation/networking/bonding.txt
34
22 35
23Identifying Your Adapter 36Identifying Your Adapter
24======================== 37========================
25 38
26For more information on how to identify your adapter, go to the Adapter & 39For more information on how to identify your adapter, go to the Adapter &
27Driver ID Guide at: 40Driver ID Guide at:
28 41
29 http://support.intel.com/support/network/adapter/pro100/21397.htm 42 http://support.intel.com/support/network/adapter/pro100/21397.htm
30 43
31For the latest Intel network drivers for Linux, refer to the following 44For the latest Intel network drivers for Linux, refer to the following
32website. In the search field, enter your adapter name or type, or use the 45website. In the search field, enter your adapter name or type, or use the
33networking link on the left to search for your adapter: 46networking link on the left to search for your adapter:
34 47
35 http://downloadfinder.intel.com/scripts-df/support_intel.asp 48 http://downloadfinder.intel.com/scripts-df/support_intel.asp
@@ -40,73 +53,75 @@ Driver Configuration Parameters
40The default value for each parameter is generally the recommended setting, 53The default value for each parameter is generally the recommended setting,
41unless otherwise noted. 54unless otherwise noted.
42 55
43Rx Descriptors: Number of receive descriptors. A receive descriptor is a data 56Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
44 structure that describes a receive buffer and its attributes to the network 57 structure that describes a receive buffer and its attributes to the network
45 controller. The data in the descriptor is used by the controller to write 58 controller. The data in the descriptor is used by the controller to write
46 data from the controller to host memory. In the 3.0.x driver the valid 59 data from the controller to host memory. In the 3.x.x driver the valid range
47 range for this parameter is 64-256. The default value is 64. This parameter 60 for this parameter is 64-256. The default value is 64. This parameter can be
48 can be changed using the command 61 changed using the command:
49 62
50 ethtool -G eth? rx n, where n is the number of desired rx descriptors. 63 ethtool -G eth? rx n, where n is the number of desired rx descriptors.
51 64
52Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a 65Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
53 data structure that describes a transmit buffer and its attributes to the 66 structure that describes a transmit buffer and its attributes to the network
54 network controller. The data in the descriptor is used by the controller to 67 controller. The data in the descriptor is used by the controller to read
55 read data from the host memory to the controller. In the 3.0.x driver the 68 data from the host memory to the controller. In the 3.x.x driver the valid
56 valid range for this parameter is 64-256. The default value is 64. This 69 range for this parameter is 64-256. The default value is 64. This parameter
57 parameter can be changed using the command 70 can be changed using the command:
58 71
59 ethtool -G eth? tx n, where n is the number of desired tx descriptors. 72 ethtool -G eth? tx n, where n is the number of desired tx descriptors.
60 73
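   A quick way to sanity-check the descriptor settings described above (not
   part of the original e100 text; it assumes an ethtool build that supports
   ring-parameter queries, and uses eth0 as a placeholder) is to read the
   current and maximum ring sizes before changing them:

       ethtool -g eth0

   and then apply new values within the reported limits, for example:

       ethtool -G eth0 rx 256 tx 256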
61Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by 74Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
62 default. Ethtool can be used as follows to force speed/duplex. 75 default. Ethtool can be used as follows to force speed/duplex.
63 76
64 ethtool -s eth? autoneg off speed {10|100} duplex {full|half} 77 ethtool -s eth? autoneg off speed {10|100} duplex {full|half}
65 78
66 NOTE: setting the speed/duplex to incorrect values will cause the link to 79 NOTE: setting the speed/duplex to incorrect values will cause the link to
67 fail. 80 fail.
68 81
69Event Log Message Level: The driver uses the message level flag to log events 82Event Log Message Level: The driver uses the message level flag to log events
70 to syslog. The message level can be set at driver load time. It can also be 83 to syslog. The message level can be set at driver load time. It can also be
71 set using the command 84 set using the command:
72 85
73 ethtool -s eth? msglvl n 86 ethtool -s eth? msglvl n
74 87
88
75Additional Configurations 89Additional Configurations
76========================= 90=========================
77 91
78 Configuring the Driver on Different Distributions 92 Configuring the Driver on Different Distributions
79 ------------------------------------------------- 93 -------------------------------------------------
80 94
81 Configuring a network driver to load properly when the system is started is 95 Configuring a network driver to load properly when the system is started is
82 distribution dependent. Typically, the configuration process involves adding 96 distribution dependent. Typically, the configuration process involves adding
83 an alias line to /etc/modules.conf as well as editing other system startup 97 an alias line to /etc/modules.conf or /etc/modprobe.conf as well as editing
84 scripts and/or configuration files. Many popular Linux distributions ship 98 other system startup scripts and/or configuration files. Many popular Linux
85 with tools to make these changes for you. To learn the proper way to 99 distributions ship with tools to make these changes for you. To learn the
86 configure a network device for your system, refer to your distribution 100 proper way to configure a network device for your system, refer to your
87 documentation. If during this process you are asked for the driver or module 101 distribution documentation. If during this process you are asked for the
88 name, the name for the Linux Base Driver for the Intel PRO/100 Family of 102 driver or module name, the name for the Linux Base Driver for the Intel
89 Adapters is e100. 103 PRO/100 Family of Adapters is e100.
90 104
91 As an example, if you install the e100 driver for two PRO/100 adapters 105 As an example, if you install the e100 driver for two PRO/100 adapters
92 (eth0 and eth1), add the following to modules.conf: 106 (eth0 and eth1), add the following to modules.conf or modprobe.conf:
93 107
94 alias eth0 e100 108 alias eth0 e100
95 alias eth1 e100 109 alias eth1 e100
96 110
97 Viewing Link Messages 111 Viewing Link Messages
98 --------------------- 112 ---------------------
99 In order to see link messages and other Intel driver information on your 113 In order to see link messages and other Intel driver information on your
100 console, you must set the dmesg level up to six. This can be done by 114 console, you must set the dmesg level up to six. This can be done by
101 entering the following on the command line before loading the e100 driver: 115 entering the following on the command line before loading the e100 driver:
102 116
103 dmesg -n 8 117 dmesg -n 8
104 118
105 If you wish to see all messages issued by the driver, including debug 119 If you wish to see all messages issued by the driver, including debug
106 messages, set the dmesg level to eight. 120 messages, set the dmesg level to eight.
107 121
108 NOTE: This setting is not saved across reboots. 122 NOTE: This setting is not saved across reboots.
109 123
124
110 Ethtool 125 Ethtool
111 ------- 126 -------
112 127
@@ -114,29 +129,27 @@ Additional Configurations
114 diagnostics, as well as displaying statistical information. Ethtool 129 diagnostics, as well as displaying statistical information. Ethtool
115 version 1.6 or later is required for this functionality. 130 version 1.6 or later is required for this functionality.
116 131
117 The latest release of ethtool can be found at: 132 The latest release of ethtool can be found from
118 http://sf.net/projects/gkernel. 133 http://sourceforge.net/projects/gkernel.
119 134
120 NOTE: This driver uses mii support from the kernel. As a result, when 135 NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
121 there is no link, ethtool will report speed/duplex to be 10/half. 136 for a more complete ethtool feature set can be enabled by upgrading
137 ethtool to ethtool-1.8.1.
122 138
123 NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
124 for a more complete ethtool feature set can be enabled by upgrading
125 ethtool to ethtool-1.8.1.
126 139
127 Enabling Wake on LAN* (WoL) 140 Enabling Wake on LAN* (WoL)
128 --------------------------- 141 ---------------------------
129 WoL is provided through the Ethtool* utility. Ethtool is included with Red 142 WoL is provided through the Ethtool* utility. Ethtool is included with Red
130 Hat* 8.0. For other Linux distributions, download and install Ethtool from 143 Hat* 8.0. For other Linux distributions, download and install Ethtool from
131 the following website: http://sourceforge.net/projects/gkernel. 144 the following website: http://sourceforge.net/projects/gkernel.
132 145
133 For instructions on enabling WoL with Ethtool, refer to the Ethtool man 146 For instructions on enabling WoL with Ethtool, refer to the Ethtool man page.
134 page.
135 147
136 WoL will be enabled on the system during the next shut down or reboot. For 148 WoL will be enabled on the system during the next shut down or reboot. For
137 this driver version, in order to enable WoL, the e100 driver must be 149 this driver version, in order to enable WoL, the e100 driver must be
138 loaded when shutting down or rebooting the system. 150 loaded when shutting down or rebooting the system.
139 151
152
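   The man page reference above has the details; as a sketch only (assuming
   the adapter and the installed ethtool expose WoL control, and using eth0
   as a placeholder), magic-packet wake can typically be enabled and then
   verified with:

       ethtool -s eth0 wol g
       ethtool eth0        # look for "Wake-on: g" in the output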
140 NAPI 153 NAPI
141 ---- 154 ----
142 155
@@ -144,6 +157,25 @@ Additional Configurations
144 157
145 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. 158 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
146 159
160 Multiple Interfaces on Same Ethernet Broadcast Network
161 ------------------------------------------------------
162
163 Due to the default ARP behavior on Linux, it is not possible to have
164 one system on two IP networks in the same Ethernet broadcast domain
165 (non-partitioned switch) behave as expected. All Ethernet interfaces
166 will respond to IP traffic for any IP address assigned to the system.
167 This results in unbalanced receive traffic.
168
169 If you have multiple interfaces in a server, either turn on ARP
170 filtering by
171
172 (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
173 (this only works if your kernel's version is higher than 2.4.5), or
174
175 (2) installing the interfaces in separate broadcast domains (either
176 in different switches or in a switch partitioned to VLANs).
177
178
147Support 179Support
148======= 180=======
149 181
@@ -151,20 +183,24 @@ For general information, go to the Intel support website at:
151 183
152 http://support.intel.com 184 http://support.intel.com
153 185
186 or the Intel Wired Networking project hosted by Sourceforge at:
187
188 http://sourceforge.net/projects/e1000
189
154If an issue is identified with the released source code on the supported 190If an issue is identified with the released source code on the supported
155kernel with a supported adapter, email the specific information related to 191kernel with a supported adapter, email the specific information related to the
156the issue to linux.nics@intel.com. 192issue to e1000-devel@lists.sourceforge.net.
157 193
158 194
159License 195License
160======= 196=======
161 197
162This software program is released under the terms of a license agreement 198This software program is released under the terms of a license agreement
163between you ('Licensee') and Intel. Do not use or load this software or any 199between you ('Licensee') and Intel. Do not use or load this software or any
164associated materials (collectively, the 'Software') until you have carefully 200associated materials (collectively, the 'Software') until you have carefully
165read the full terms and conditions of the LICENSE located in this software 201read the full terms and conditions of the file COPYING located in this software
166package. By loading or using the Software, you agree to the terms of this 202package. By loading or using the Software, you agree to the terms of this
167Agreement. If you do not agree with the terms of this Agreement, do not 203Agreement. If you do not agree with the terms of this Agreement, do not install
168install or use the Software. 204or use the Software.
169 205
170* Other names and brands may be claimed as the property of others. 206* Other names and brands may be claimed as the property of others.
diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt
index 2ebd4058d46d..71fe15af356c 100644
--- a/Documentation/networking/e1000.txt
+++ b/Documentation/networking/e1000.txt
@@ -1,7 +1,7 @@
1Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters 1Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters
2=============================================================== 2===============================================================
3 3
4November 17, 2004 4November 15, 2005
5 5
6 6
7Contents 7Contents
@@ -20,254 +20,316 @@ In This Release
20=============== 20===============
21 21
22This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family 22This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family
23of Adapters, version 5.x.x. 23of Adapters. This driver includes support for Itanium(R)2-based systems.
24 24
25For questions related to hardware requirements, refer to the documentation 25For questions related to hardware requirements, refer to the documentation
26supplied with your Intel PRO/1000 adapter. All hardware requirements listed 26supplied with your Intel PRO/1000 adapter. All hardware requirements listed
27apply to use with Linux. 27apply to use with Linux.
28 28
29Native VLANs are now available with supported kernels. 29The following features are now available in supported kernels:
30 - Native VLANs
31 - Channel Bonding (teaming)
32 - SNMP
33
34Channel Bonding documentation can be found in the Linux kernel source:
35/Documentation/networking/bonding.txt
36
37The driver information previously displayed in the /proc filesystem is not
38supported in this release. Alternatively, you can use ethtool (version 1.6
39or later), lspci, and ifconfig to obtain the same information.
40
41Instructions on updating ethtool can be found in the section "Additional
42Configurations" later in this document.
43
30 44
31Identifying Your Adapter 45Identifying Your Adapter
32======================== 46========================
33 47
34For more information on how to identify your adapter, go to the Adapter & 48For more information on how to identify your adapter, go to the Adapter &
35Driver ID Guide at: 49Driver ID Guide at:
36 50
37 http://support.intel.com/support/network/adapter/pro100/21397.htm 51 http://support.intel.com/support/network/adapter/pro100/21397.htm
38 52
39For the latest Intel network drivers for Linux, refer to the following 53For the latest Intel network drivers for Linux, refer to the following
40website. In the search field, enter your adapter name or type, or use the 54website. In the search field, enter your adapter name or type, or use the
41networking link on the left to search for your adapter: 55networking link on the left to search for your adapter:
42 56
43 http://downloadfinder.intel.com/scripts-df/support_intel.asp 57 http://downloadfinder.intel.com/scripts-df/support_intel.asp
44 58
45Command Line Parameters
46=======================
47 59
48If the driver is built as a module, the following optional parameters are 60Command Line Parameters =======================
49used by entering them on the command line with the modprobe or insmod command 61
50using this syntax: 62If the driver is built as a module, the following optional parameters
63are used by entering them on the command line with the modprobe or insmod
64command using this syntax:
51 65
52 modprobe e1000 [<option>=<VAL1>,<VAL2>,...] 66 modprobe e1000 [<option>=<VAL1>,<VAL2>,...]
53 67
54 insmod e1000 [<option>=<VAL1>,<VAL2>,...] 68 insmod e1000 [<option>=<VAL1>,<VAL2>,...]
55 69
56For example, with two PRO/1000 PCI adapters, entering: 70For example, with two PRO/1000 PCI adapters, entering:
57 71
58 insmod e1000 TxDescriptors=80,128 72 insmod e1000 TxDescriptors=80,128
59 73
60loads the e1000 driver with 80 TX descriptors for the first adapter and 128 TX 74loads the e1000 driver with 80 TX descriptors for the first adapter and 128
61descriptors for the second adapter. 75TX descriptors for the second adapter.
62 76
63The default value for each parameter is generally the recommended setting, 77The default value for each parameter is generally the recommended setting,
64unless otherwise noted. Also, if the driver is statically built into the 78unless otherwise noted.
65kernel, the driver is loaded with the default values for all the parameters. 79
66Ethtool can be used to change some of the parameters at runtime. 80NOTES: For more information about the AutoNeg, Duplex, and Speed
81 parameters, see the "Speed and Duplex Configuration" section in
82 this document.
67 83
68 NOTES: For more information about the AutoNeg, Duplex, and Speed 84 For more information about the InterruptThrottleRate,
69 parameters, see the "Speed and Duplex Configuration" section in 85 RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay
70 this document. 86 parameters, see the application note at:
87 http://www.intel.com/design/network/applnots/ap450.htm
71 88
72 For more information about the InterruptThrottleRate, RxIntDelay, 89 A descriptor describes a data buffer and attributes related to
73 TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay parameters, see the 90 the data buffer. This information is accessed by the hardware.
74 application note at:
75 http://www.intel.com/design/network/applnots/ap450.htm
76 91
77 A descriptor describes a data buffer and attributes related to the
78 data buffer. This information is accessed by the hardware.
79 92
80AutoNeg (adapters using copper connections only) 93AutoNeg
81Valid Range: 0x01-0x0F, 0x20-0x2F 94-------
95(Supported only on adapters with copper connections)
96Valid Range: 0x01-0x0F, 0x20-0x2F
82Default Value: 0x2F 97Default Value: 0x2F
83 This parameter is a bit mask that specifies which speed and duplex 98
84 settings the board advertises. When this parameter is used, the Speed and 99This parameter is a bit mask that specifies which speed and duplex
85 Duplex parameters must not be specified. 100settings the board advertises. When this parameter is used, the Speed
86 NOTE: Refer to the Speed and Duplex section of this readme for more 101and Duplex parameters must not be specified.
87 information on the AutoNeg parameter. 102
88 103NOTE: Refer to the Speed and Duplex section of this readme for more
89Duplex (adapters using copper connections only) 104 information on the AutoNeg parameter.
90Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) 105
106
107Duplex
108------
109(Supported only on adapters with copper connections)
110Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full)
91Default Value: 0 111Default Value: 0
92 Defines the direction in which data is allowed to flow. Can be either one 112
93 or two-directional. If both Duplex and the link partner are set to auto- 113Defines the direction in which data is allowed to flow. Can be either
94 negotiate, the board auto-detects the correct duplex. If the link partner 114one or two-directional. If both Duplex and the link partner are set to
95 is forced (either full or half), Duplex defaults to half-duplex. 115auto-negotiate, the board auto-detects the correct duplex. If the link
116partner is forced (either full or half), Duplex defaults to half-duplex.
117
96 118
97FlowControl 119FlowControl
98Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) 120----------
99Default: Read flow control settings from the EEPROM 121Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
100 This parameter controls the automatic generation(Tx) and response(Rx) to 122Default Value: Reads flow control settings from the EEPROM
101 Ethernet PAUSE frames. 123
124This parameter controls the automatic generation(Tx) and response(Rx)
125to Ethernet PAUSE frames.
126
102 127
103InterruptThrottleRate 128InterruptThrottleRate
104Valid Range: 100-100000 (0=off, 1=dynamic) 129---------------------
130(not supported on Intel 82542, 82543 or 82544-based adapters)
131Valid Range: 100-100000 (0=off, 1=dynamic)
105Default Value: 8000 132Default Value: 8000
106 This value represents the maximum number of interrupts per second the 133
107 controller generates. InterruptThrottleRate is another setting used in 134This value represents the maximum number of interrupts per second the
108 interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust 135controller generates. InterruptThrottleRate is another setting used in
109 InterruptThrottleRate based on the current traffic load. 136interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust
110Un-supported Adapters: InterruptThrottleRate is NOT supported by 82542, 82543 137InterruptThrottleRate based on the current traffic load.
111 or 82544-based adapters. 138
112 139NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and
113 NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and 140 RxAbsIntDelay parameters. In other words, minimizing the receive
114 RxAbsIntDelay parameters. In other words, minimizing the receive 141 and/or transmit absolute delays does not force the controller to
115 and/or transmit absolute delays does not force the controller to 142 generate more interrupts than what the Interrupt Throttle Rate
116 generate more interrupts than what the Interrupt Throttle Rate 143 allows.
117 allows. 144
118 CAUTION: If you are using the Intel PRO/1000 CT Network Connection 145CAUTION: If you are using the Intel PRO/1000 CT Network Connection
119 (controller 82547), setting InterruptThrottleRate to a value 146 (controller 82547), setting InterruptThrottleRate to a value
120 greater than 75,000, may hang (stop transmitting) adapters under 147 greater than 75,000, may hang (stop transmitting) adapters
121 certain network conditions. If this occurs a NETDEV WATCHDOG 148 under certain network conditions. If this occurs a NETDEV
122 message is logged in the system event log. In addition, the 149 WATCHDOG message is logged in the system event log. In
123 controller is automatically reset, restoring the network 150 addition, the controller is automatically reset, restoring
124 connection. To eliminate the potential for the hang, ensure 151 the network connection. To eliminate the potential for the
125 that InterruptThrottleRate is set no greater than 75,000 and is 152 hang, ensure that InterruptThrottleRate is set no greater
126 not set to 0. 153 than 75,000 and is not set to 0.
127 NOTE: When e1000 is loaded with default settings and multiple adapters are 154
128 in use simultaneously, the CPU utilization may increase non-linearly. 155NOTE: When e1000 is loaded with default settings and multiple adapters
129 In order to limit the CPU utilization without impacting the overall 156 are in use simultaneously, the CPU utilization may increase non-
130 throughput, we recommend that you load the driver as follows: 157 linearly. In order to limit the CPU utilization without impacting
131 158 the overall throughput, we recommend that you load the driver as
132 insmod e1000.o InterruptThrottleRate=3000,3000,3000 159 follows:
133 160
134 This sets the InterruptThrottleRate to 3000 interrupts/sec for the 161 insmod e1000.o InterruptThrottleRate=3000,3000,3000
135 first, second, and third instances of the driver. The range of 2000 to 162
136 3000 interrupts per second works on a majority of systems and is a 163 This sets the InterruptThrottleRate to 3000 interrupts/sec for
137 good starting point, but the optimal value will be platform-specific. 164 the first, second, and third instances of the driver. The range
138 If CPU utilization is not a concern, use RX_POLLING (NAPI) and default 165 of 2000 to 3000 interrupts per second works on a majority of
139 driver settings. 166 systems and is a good starting point, but the optimal value will
167 be platform-specific. If CPU utilization is not a concern, use
168 RX_POLLING (NAPI) and default driver settings.
169
140 170
141RxDescriptors 171RxDescriptors
142Valid Range: 80-256 for 82542 and 82543-based adapters 172-------------
143 80-4096 for all other supported adapters 173Valid Range: 80-256 for 82542 and 82543-based adapters
174 80-4096 for all other supported adapters
144Default Value: 256 175Default Value: 256
145 This value is the number of receive descriptors allocated by the driver.
146 Increasing this value allows the driver to buffer more incoming packets.
147 Each descriptor is 16 bytes. A receive buffer is allocated for each
148 descriptor and can either be 2048 or 4096 bytes long, depending on the MTU
149 176
150 setting. An incoming packet can span one or more receive descriptors. 177This value specifies the number of receive descriptors allocated by the
151 The maximum MTU size is 16110. 178driver. Increasing this value allows the driver to buffer more incoming
179packets. Each descriptor is 16 bytes. A receive buffer is also
180allocated for each descriptor and is 2048.
152 181
153 NOTE: MTU designates the frame size. It only needs to be set for Jumbo
154 Frames.
155 NOTE: Depending on the available system resources, the request for a
156 higher number of receive descriptors may be denied. In this case,
157 use a lower number.
158 182
159RxIntDelay 183RxIntDelay
160Valid Range: 0-65535 (0=off) 184----------
185Valid Range: 0-65535 (0=off)
161Default Value: 0 186Default Value: 0
162 This value delays the generation of receive interrupts in units of 1.024 187
163 microseconds. Receive interrupt reduction can improve CPU efficiency if 188This value delays the generation of receive interrupts in units of 1.024
164 properly tuned for specific network traffic. Increasing this value adds 189microseconds. Receive interrupt reduction can improve CPU efficiency if
165 extra latency to frame reception and can end up decreasing the throughput 190properly tuned for specific network traffic. Increasing this value adds
166 of TCP traffic. If the system is reporting dropped receives, this value 191extra latency to frame reception and can end up decreasing the throughput
167 may be set too high, causing the driver to run out of available receive 192of TCP traffic. If the system is reporting dropped receives, this value
168 descriptors. 193may be set too high, causing the driver to run out of available receive
169 194descriptors.
170 CAUTION: When setting RxIntDelay to a value other than 0, adapters may 195
171 hang (stop transmitting) under certain network conditions. If 196CAUTION: When setting RxIntDelay to a value other than 0, adapters may
172 this occurs a NETDEV WATCHDOG message is logged in the system 197 hang (stop transmitting) under certain network conditions. If
173 event log. In addition, the controller is automatically reset, 198 this occurs a NETDEV WATCHDOG message is logged in the system
174 restoring the network connection. To eliminate the potential for 199 event log. In addition, the controller is automatically reset,
175 the hang ensure that RxIntDelay is set to 0. 200 restoring the network connection. To eliminate the potential
176 201 for the hang ensure that RxIntDelay is set to 0.
177RxAbsIntDelay (82540, 82545 and later adapters only) 202
178Valid Range: 0-65535 (0=off) 203
204RxAbsIntDelay
205-------------
206(This parameter is supported only on 82540, 82545 and later adapters.)
207Valid Range: 0-65535 (0=off)
179Default Value: 128 208Default Value: 128
180 This value, in units of 1.024 microseconds, limits the delay in which a 209
181 receive interrupt is generated. Useful only if RxIntDelay is non-zero, 210This value, in units of 1.024 microseconds, limits the delay in which a
182 this value ensures that an interrupt is generated after the initial 211receive interrupt is generated. Useful only if RxIntDelay is non-zero,
183 packet is received within the set amount of time. Proper tuning, 212this value ensures that an interrupt is generated after the initial
184 along with RxIntDelay, may improve traffic throughput in specific network 213packet is received within the set amount of time. Proper tuning,
185 conditions. 214along with RxIntDelay, may improve traffic throughput in specific network
186 215conditions.
187Speed (adapters using copper connections only) 216
217
218Speed
219-----
220(This parameter is supported only on adapters with copper connections.)
188Valid Settings: 0, 10, 100, 1000 221Valid Settings: 0, 10, 100, 1000
189Default Value: 0 (auto-negotiate at all supported speeds) 222Default Value: 0 (auto-negotiate at all supported speeds)
190 Speed forces the line speed to the specified value in megabits per second 223
191 (Mbps). If this parameter is not specified or is set to 0 and the link 224Speed forces the line speed to the specified value in megabits per second
192 partner is set to auto-negotiate, the board will auto-detect the correct 225(Mbps). If this parameter is not specified or is set to 0 and the link
193 speed. Duplex should also be set when Speed is set to either 10 or 100. 226partner is set to auto-negotiate, the board will auto-detect the correct
227speed. Duplex should also be set when Speed is set to either 10 or 100.
228
194 229
195TxDescriptors 230TxDescriptors
196Valid Range: 80-256 for 82542 and 82543-based adapters 231-------------
197 80-4096 for all other supported adapters 232Valid Range: 80-256 for 82542 and 82543-based adapters
233 80-4096 for all other supported adapters
198Default Value: 256 234Default Value: 256
199 This value is the number of transmit descriptors allocated by the driver.
200 Increasing this value allows the driver to queue more transmits. Each
201 descriptor is 16 bytes.
202 235
203 NOTE: Depending on the available system resources, the request for a 236This value is the number of transmit descriptors allocated by the driver.
204 higher number of transmit descriptors may be denied. In this case, 237Increasing this value allows the driver to queue more transmits. Each
205 use a lower number. 238descriptor is 16 bytes.
239
240NOTE: Depending on the available system resources, the request for a
241 higher number of transmit descriptors may be denied. In this case,
242 use a lower number.
243
206 244
207TxIntDelay 245TxIntDelay
208Valid Range: 0-65535 (0=off) 246----------
247Valid Range: 0-65535 (0=off)
209Default Value: 64 248Default Value: 64
210 This value delays the generation of transmit interrupts in units of 249
211 1.024 microseconds. Transmit interrupt reduction can improve CPU 250This value delays the generation of transmit interrupts in units of
212 efficiency if properly tuned for specific network traffic. If the 2511.024 microseconds. Transmit interrupt reduction can improve CPU
213 system is reporting dropped transmits, this value may be set too high 252efficiency if properly tuned for specific network traffic. If the
214 causing the driver to run out of available transmit descriptors. 253system is reporting dropped transmits, this value may be set too high
215 254causing the driver to run out of available transmit descriptors.
216TxAbsIntDelay (82540, 82545 and later adapters only) 255
217Valid Range: 0-65535 (0=off) 256
257TxAbsIntDelay
258-------------
259(This parameter is supported only on 82540, 82545 and later adapters.)
260Valid Range: 0-65535 (0=off)
218Default Value: 64 261Default Value: 64
219 This value, in units of 1.024 microseconds, limits the delay in which a 262
220 transmit interrupt is generated. Useful only if TxIntDelay is non-zero, 263This value, in units of 1.024 microseconds, limits the delay in which a
221 this value ensures that an interrupt is generated after the initial 264transmit interrupt is generated. Useful only if TxIntDelay is non-zero,
222 packet is sent on the wire within the set amount of time. Proper tuning, 265this value ensures that an interrupt is generated after the initial
223 along with TxIntDelay, may improve traffic throughput in specific 266packet is sent on the wire within the set amount of time. Proper tuning,
224 network conditions. 267along with TxIntDelay, may improve traffic throughput in specific
225 268network conditions.
226XsumRX (not available on the 82542-based adapter) 269
227Valid Range: 0-1 270XsumRX
271------
272(This parameter is NOT supported on the 82542-based adapter.)
273Valid Range: 0-1
228Default Value: 1 274Default Value: 1
229 A value of '1' indicates that the driver should enable IP checksum 275
230 offload for received packets (both UDP and TCP) to the adapter hardware. 276A value of '1' indicates that the driver should enable IP checksum
277offload for received packets (both UDP and TCP) to the adapter hardware.
278
231 279
232Speed and Duplex Configuration 280Speed and Duplex Configuration
233============================== 281==============================
234 282
235Three keywords are used to control the speed and duplex configuration. These 283Three keywords are used to control the speed and duplex configuration.
236keywords are Speed, Duplex, and AutoNeg. 284These keywords are Speed, Duplex, and AutoNeg.
237 285
238If the board uses a fiber interface, these keywords are ignored, and the 286If the board uses a fiber interface, these keywords are ignored, and the
239fiber interface board only links at 1000 Mbps full-duplex. 287fiber interface board only links at 1000 Mbps full-duplex.
240 288
241For copper-based boards, the keywords interact as follows: 289For copper-based boards, the keywords interact as follows:
242 290
243 The default operation is auto-negotiate. The board advertises all supported 291 The default operation is auto-negotiate. The board advertises all
244 speed and duplex combinations, and it links at the highest common speed and 292 supported speed and duplex combinations, and it links at the highest
245 duplex mode IF the link partner is set to auto-negotiate. 293 common speed and duplex mode IF the link partner is set to auto-negotiate.
246 294
247 If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps is 295 If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
248 advertised (The 1000BaseT spec requires auto-negotiation.) 296 is advertised (The 1000BaseT spec requires auto-negotiation.)
249 297
250 If Speed = 10 or 100, then both Speed and Duplex should be set. Auto- 298 If Speed = 10 or 100, then both Speed and Duplex should be set. Auto-
251 negotiation is disabled, and the AutoNeg parameter is ignored. Partner SHOULD 299 negotiation is disabled, and the AutoNeg parameter is ignored. Partner
252 also be forced. 300 SHOULD also be forced.
301
302The AutoNeg parameter is used when more control is required over the
303auto-negotiation process. It should be used when you wish to control which
304speed and duplex combinations are advertised during the auto-negotiation
305process.
306
307The parameter may be specified as either a decimal or hexadecimal value as
308determined by the bitmap below.
253 309
254The AutoNeg parameter is used when more control is required over the auto- 310Bit position 7 6 5 4 3 2 1 0
255negotiation process. When this parameter is used, Speed and Duplex parameters 311Decimal Value 128 64 32 16 8 4 2 1
256must not be specified. The following table describes supported values for the 312Hex value 80 40 20 10 8 4 2 1
257AutoNeg parameter: 313Speed (Mbps) N/A N/A 1000 N/A 100 100 10 10
314Duplex Full Full Half Full Half
258 315
259Speed (Mbps) 1000 100 100 10 10 316Some examples of using AutoNeg:
260Duplex Full Full Half Full Half
261Value (in base 16) 0x20 0x08 0x04 0x02 0x01
262 317
263Example: insmod e1000 AutoNeg=0x03, loads e1000 and specifies (10 full duplex, 318 modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half)
26410 half duplex) for negotiation with the peer. 319 modprobe e1000 AutoNeg=1 (Same as above)
320 modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full)
321 modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full)
322 modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half)
323 modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100
324 Half)
325 modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full)
326 modprobe e1000 AutoNeg=32 (Same as above)
265 327
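The bits in the table above can also be combined; as an additional example
derived from that bitmap (not one of the original examples):

 modprobe e1000 AutoNeg=0x28 (Restricts autonegotiation to 100 Full or 1000 Full)
 modprobe e1000 AutoNeg=40 (Same as above, expressed in decimal)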
266Note that setting AutoNeg does not guarantee that the board will link at the 328Note that when this parameter is used, Speed and Duplex must not be specified.
267highest specified speed or duplex mode, but the board will link at the 329
268highest possible speed/duplex of the link partner IF the link partner is also 330If the link partner is forced to a specific speed and duplex, then this
269set to auto-negotiate. If the link partner is forced speed/duplex, the 331parameter should not be used. Instead, use the Speed and Duplex parameters
270adapter MUST be forced to the same speed/duplex. 332previously mentioned to force the adapter to the same speed and duplex.
271 333
272 334
273Additional Configurations 335Additional Configurations
@@ -276,19 +338,19 @@ Additional Configurations
276 Configuring the Driver on Different Distributions 338 Configuring the Driver on Different Distributions
277 ------------------------------------------------- 339 -------------------------------------------------
278 340
279 Configuring a network driver to load properly when the system is started is 341 Configuring a network driver to load properly when the system is started
280 distribution dependent. Typically, the configuration process involves adding 342 is distribution dependent. Typically, the configuration process involves
281 an alias line to /etc/modules.conf as well as editing other system startup 343 adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well
282 scripts and/or configuration files. Many popular Linux distributions ship 344 as editing other system startup scripts and/or configuration files. Many
283 with tools to make these changes for you. To learn the proper way to 345 popular Linux distributions ship with tools to make these changes for you.
284 configure a network device for your system, refer to your distribution 346 To learn the proper way to configure a network device for your system,
285 documentation. If during this process you are asked for the driver or module 347 refer to your distribution documentation. If during this process you are
286 name, the name for the Linux Base Driver for the Intel PRO/1000 Family of 348 asked for the driver or module name, the name for the Linux Base Driver
287 Adapters is e1000. 349 for the Intel PRO/1000 Family of Adapters is e1000.
288 350
289 As an example, if you install the e1000 driver for two PRO/1000 adapters 351 As an example, if you install the e1000 driver for two PRO/1000 adapters
290 (eth0 and eth1) and set the speed and duplex to 10full and 100half, add the 352 (eth0 and eth1) and set the speed and duplex to 10full and 100half, add
291 following to modules.conf: 353 the following to modules.conf or or modprobe.conf:
292 354
293 alias eth0 e1000 355 alias eth0 e1000
294 alias eth1 e1000 356 alias eth1 e1000
@@ -297,9 +359,9 @@ Additional Configurations
297 Viewing Link Messages 359 Viewing Link Messages
298 --------------------- 360 ---------------------
299 361
300 Link messages will not be displayed to the console if the distribution is 362 Link messages will not be displayed to the console if the distribution is
301 restricting system messages. In order to see network driver link messages on 363 restricting system messages. In order to see network driver link messages
302 your console, set dmesg to eight by entering the following: 364 on your console, set dmesg to eight by entering the following:
303 365
304 dmesg -n 8 366 dmesg -n 8
305 367
@@ -308,22 +370,42 @@ Additional Configurations
308 Jumbo Frames 370 Jumbo Frames
309 ------------ 371 ------------
310 372
311 The driver supports Jumbo Frames for all adapters except 82542-based 373 The driver supports Jumbo Frames for all adapters except 82542 and
312 adapters. Jumbo Frames support is enabled by changing the MTU to a value 374 82573-based adapters. Jumbo Frames support is enabled by changing the
313 larger than the default of 1500. Use the ifconfig command to increase the 375 MTU to a value larger than the default of 1500. Use the ifconfig command
314 MTU size. For example: 376 to increase the MTU size. For example:
377
378 ifconfig eth<x> mtu 9000 up
379
380 This setting is not saved across reboots. It can be made permanent if
381 you add:
382
383 MTU=9000
315 384
316 ifconfig ethx mtu 9000 up 385 to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example
386 applies to the Red Hat distributions; other distributions may store this
387 setting in a different location.
317 388
318 The maximum MTU setting for Jumbo Frames is 16110. This value coincides 389 Notes:
319 with the maximum Jumbo Frames size of 16128.
320 390
321 NOTE: Jumbo Frames are supported at 1000 Mbps only. Using Jumbo Frames at 391 - To enable Jumbo Frames, increase the MTU size on the interface beyond
322 10 or 100 Mbps may result in poor performance or loss of link. 392 1500.
393 - The maximum MTU setting for Jumbo Frames is 16110. This value coincides
394 with the maximum Jumbo Frames size of 16128.
395 - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or
396 loss of link.
397 - Some Intel gigabit adapters that support Jumbo Frames have a frame size
398 limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes.
399 The adapters with this limitation are based on the Intel 82571EB and
400 82572EI controllers, which correspond to these product names:
401 Intel® PRO/1000 PT Dual Port Server Adapter
402 Intel® PRO/1000 PF Dual Port Server Adapter
403 Intel® PRO/1000 PT Server Adapter
404 Intel® PRO/1000 PT Desktop Adapter
405 Intel® PRO/1000 PF Server Adapter
323 406
407 - The Intel PRO/1000 PM Network Connection does not support jumbo frames.
324 408
325 NOTE: MTU designates the frame size. To enable Jumbo Frames, increase the
326 MTU size on the interface beyond 1500.
327 409
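   On distributions that use the iproute2 tools instead of ifconfig (an
   alternative not mentioned in the text above; eth0 is a placeholder), the
   same runtime MTU change can be made with:

       ip link set dev eth0 mtu 9000

   This is likewise not saved across reboots, so the ifcfg-eth<x> (or
   equivalent) change described above is still needed for persistence.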
328 Ethtool 410 Ethtool
329 ------- 411 -------
@@ -333,32 +415,41 @@ Additional Configurations
333 version 1.6 or later is required for this functionality. 415 version 1.6 or later is required for this functionality.
334 416
335 The latest release of ethtool can be found from 417 The latest release of ethtool can be found from
336 http://sf.net/projects/gkernel. 418 http://sourceforge.net/projects/gkernel.
337 419
338 NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support 420 NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
339 for a more complete ethtool feature set can be enabled by upgrading 421 for a more complete ethtool feature set can be enabled by upgrading
340 ethtool to ethtool-1.8.1. 422 ethtool to ethtool-1.8.1.
341 423
342 Enabling Wake on LAN* (WoL) 424 Enabling Wake on LAN* (WoL)
343 --------------------------- 425 ---------------------------
344 426
345 WoL is configured through the Ethtool* utility. Ethtool is included with 427 WoL is configured through the Ethtool* utility. Ethtool is included with
346 all versions of Red Hat after Red Hat 7.2. For other Linux distributions, 428 all versions of Red Hat after Red Hat 7.2. For other Linux distributions,
347 download and install Ethtool from the following website: 429 download and install Ethtool from the following website:
348 http://sourceforge.net/projects/gkernel. 430 http://sourceforge.net/projects/gkernel.
349 431
350 For instructions on enabling WoL with Ethtool, refer to the website listed 432 For instructions on enabling WoL with Ethtool, refer to the website listed
351 above. 433 above.
352 434
353 WoL will be enabled on the system during the next shut down or reboot. 435 WoL will be enabled on the system during the next shut down or reboot.
354 For this driver version, in order to enable WoL, the e1000 driver must be 436 For this driver version, in order to enable WoL, the e1000 driver must be
355 loaded when shutting down or rebooting the system. 437 loaded when shutting down or rebooting the system.
356 438
357 NAPI 439 NAPI
358 ---- 440 ----
359 441
360 NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled 442 NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled
361 or disabled based on the configuration of the kernel. 443 or disabled based on the configuration of the kernel. To override
444 the default, use the following compile-time flags.
445
446 To enable NAPI, compile the driver module, passing in a configuration option:
447
448 make CFLAGS_EXTRA=-DE1000_NAPI install
449
450 To disable NAPI, compile the driver module, passing in a configuration option:
451
452 make CFLAGS_EXTRA=-DE1000_NO_NAPI install
362 453
363 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. 454 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
364 455
@@ -369,10 +460,85 @@ Known Issues
369 Jumbo Frames System Requirement 460 Jumbo Frames System Requirement
370 ------------------------------- 461 -------------------------------
371 462
372 Memory allocation failures have been observed on Linux systems with 64 MB 463 Memory allocation failures have been observed on Linux systems with 64 MB
373 of RAM or less that are running Jumbo Frames. If you are using Jumbo Frames, 464 of RAM or less that are running Jumbo Frames. If you are using Jumbo
374 your system may require more than the advertised minimum requirement of 64 MB 465 Frames, your system may require more than the advertised minimum
375 of system memory. 466 requirement of 64 MB of system memory.
467
468 Performance Degradation with Jumbo Frames
469 -----------------------------------------
470
471 Degradation in throughput performance may be observed in some Jumbo frames
472 environments. If this is observed, increasing the application's socket
473 buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values
474 may help. See the specific application manual and
475 /usr/src/linux*/Documentation/
476 networking/ip-sysctl.txt for more details.
477
478 Jumbo frames on Foundry BigIron 8000 switch
479 -------------------------------------------
480 There is a known issue using Jumbo frames when connected to a Foundry
481 BigIron 8000 switch. This is a 3rd party limitation. If you experience
482 loss of packets, lower the MTU size.
483
484 Multiple Interfaces on Same Ethernet Broadcast Network
485 ------------------------------------------------------
486
487 Due to the default ARP behavior on Linux, it is not possible to have
488 one system on two IP networks in the same Ethernet broadcast domain
489 (non-partitioned switch) behave as expected. All Ethernet interfaces
490 will respond to IP traffic for any IP address assigned to the system.
491 This results in unbalanced receive traffic.
492
493 If you have multiple interfaces in a server, either turn on ARP
494 filtering by entering:
495
496 echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
497 (this only works if your kernel's version is higher than 2.4.5),
498
499 NOTE: This setting is not saved across reboots. The configuration
500 change can be made permanent by adding the line:
501 net.ipv4.conf.all.arp_filter = 1
502 to the file /etc/sysctl.conf
503
504 or,
505
506 install the interfaces in separate broadcast domains (either in
507 different switches or in a switch partitioned to VLANs).
508
509 82541/82547 can't link or are slow to link with some link partners
510 -----------------------------------------------------------------
511
512 There is a known compatibility issue with 82541/82547 and some
513 low-end switches where the link will not be established, or will
514 be slow to establish. In particular, these switches are known to
515 be incompatible with 82541/82547:
516
517 Planex FXG-08TE
518 I-O Data ETG-SH8
519
520 To work around this issue, the driver can be compiled with an override
521 of the PHY's master/slave setting. Forcing master or forcing slave
522 mode will improve time-to-link.
523
524 # make EXTRA_CFLAGS=-DE1000_MASTER_SLAVE=<n>
525
526 Where <n> is:
527
528 0 = Hardware default
529 1 = Master mode
530 2 = Slave mode
531 3 = Auto master/slave
532
533 Disable rx flow control with ethtool
534 ------------------------------------
535
536 In order to disable receive flow control using ethtool, you must turn
537 off auto-negotiation on the same command line.
538
539 For example:
540
541 ethtool -A eth? autoneg off rx off
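For instance, assuming eth0 is the e1000 interface in question, the
following disables receive flow control and then reads back the pause
parameters to confirm the change:

   ethtool -A eth0 autoneg off rx off
   ethtool -a eth0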
376 542
377 543
378Support 544Support
@@ -382,20 +548,24 @@ For general information, go to the Intel support website at:
382 548
383 http://support.intel.com 549 http://support.intel.com
384 550
551 or the Intel Wired Networking project hosted by Sourceforge at:
552
553 http://sourceforge.net/projects/e1000
554
385If an issue is identified with the released source code on the supported 555If an issue is identified with the released source code on the supported
386kernel with a supported adapter, email the specific information related to 556kernel with a supported adapter, email the specific information related
 387the issue to linux.nics@intel.com. 557to the issue to e1000-devel@lists.sourceforge.net.
388 558
389 559
390License 560License
391======= 561=======
392 562
393This software program is released under the terms of a license agreement 563This software program is released under the terms of a license agreement
394between you ('Licensee') and Intel. Do not use or load this software or any 564between you ('Licensee') and Intel. Do not use or load this software or any
395associated materials (collectively, the 'Software') until you have carefully 565associated materials (collectively, the 'Software') until you have carefully
396read the full terms and conditions of the LICENSE located in this software 566read the full terms and conditions of the file COPYING located in this software
397package. By loading or using the Software, you agree to the terms of this 567package. By loading or using the Software, you agree to the terms of this
398Agreement. If you do not agree with the terms of this Agreement, do not 568Agreement. If you do not agree with the terms of this Agreement, do not
399install or use the Software. 569install or use the Software.
400 570
401* Other names and brands may be claimed as the property of others. 571* Other names and brands may be claimed as the property of others.
diff --git a/MAINTAINERS b/MAINTAINERS
index b0dc75a5e74e..dd1351dc32b8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1349,10 +1349,10 @@ S: Maintained
1349INTEL PRO/100 ETHERNET SUPPORT 1349INTEL PRO/100 ETHERNET SUPPORT
1350P: John Ronciak 1350P: John Ronciak
1351M: john.ronciak@intel.com 1351M: john.ronciak@intel.com
1352P: Ganesh Venkatesan
1353M: ganesh.venkatesan@intel.com
1354P: Jesse Brandeburg 1352P: Jesse Brandeburg
1355M: jesse.brandeburg@intel.com 1353M: jesse.brandeburg@intel.com
1354P: Jeff Kirsher
1355M: jeffrey.t.kirsher@intel.com
1356W: http://sourceforge.net/projects/e1000/ 1356W: http://sourceforge.net/projects/e1000/
1357S: Supported 1357S: Supported
1358 1358
@@ -1361,18 +1361,22 @@ P: Jeb Cramer
1361M: cramerj@intel.com 1361M: cramerj@intel.com
1362P: John Ronciak 1362P: John Ronciak
1363M: john.ronciak@intel.com 1363M: john.ronciak@intel.com
1364P: Ganesh Venkatesan 1364P: Jesse Brandeburg
1365M: ganesh.venkatesan@intel.com 1365M: jesse.brandeburg@intel.com
1366P: Jeff Kirsher
1367M: jeffrey.t.kirsher@intel.com
1366W: http://sourceforge.net/projects/e1000/ 1368W: http://sourceforge.net/projects/e1000/
1367S: Supported 1369S: Supported
1368 1370
1369INTEL PRO/10GbE SUPPORT 1371INTEL PRO/10GbE SUPPORT
1372P: Jeff Kirsher
1373M: jeffrey.t.kirsher@intel.com
1370P: Ayyappan Veeraiyan 1374P: Ayyappan Veeraiyan
1371M: ayyappan.veeraiyan@intel.com 1375M: ayyappan.veeraiyan@intel.com
1372P: Ganesh Venkatesan
1373M: ganesh.venkatesan@intel.com
1374P: John Ronciak 1376P: John Ronciak
1375M: john.ronciak@intel.com 1377M: john.ronciak@intel.com
1378P: Jesse Brandeburg
1379M: jesse.brandeburg@intel.com
1376W: http://sourceforge.net/projects/e1000/ 1380W: http://sourceforge.net/projects/e1000/
1377S: Supported 1381S: Supported
1378 1382
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 486d7945583d..544ac5dc09eb 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end)
357 void *__start = start; 357 void *__start = start;
358 for (; __start < end; __start += PAGE_SIZE) { 358 for (; __start < end; __start += PAGE_SIZE) {
359 ClearPageReserved(virt_to_page(__start)); 359 ClearPageReserved(virt_to_page(__start));
360 set_page_count(virt_to_page(__start), 1); 360 init_page_count(virt_to_page(__start));
361 free_page((long)__start); 361 free_page((long)__start);
362 totalram_pages++; 362 totalram_pages++;
363 } 363 }
diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
index c2ee18d2075e..8a1bfcd50087 100644
--- a/arch/arm/mm/consistent.c
+++ b/arch/arm/mm/consistent.c
@@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
223 pte = consistent_pte[idx] + off; 223 pte = consistent_pte[idx] + off;
224 c->vm_pages = page; 224 c->vm_pages = page;
225 225
226 split_page(page, order);
227
226 /* 228 /*
227 * Set the "dma handle" 229 * Set the "dma handle"
228 */ 230 */
@@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
231 do { 233 do {
232 BUG_ON(!pte_none(*pte)); 234 BUG_ON(!pte_none(*pte));
233 235
234 set_page_count(page, 1);
235 /* 236 /*
236 * x86 does not mark the pages reserved... 237 * x86 does not mark the pages reserved...
237 */ 238 */
@@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
250 * Free the otherwise unused pages. 251 * Free the otherwise unused pages.
251 */ 252 */
252 while (page < end) { 253 while (page < end) {
253 set_page_count(page, 1);
254 __free_page(page); 254 __free_page(page);
255 page++; 255 page++;
256 } 256 }
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 8b276ee38acf..b0321e943b76 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
531 for (; addr < end; addr += PAGE_SIZE) { 531 for (; addr < end; addr += PAGE_SIZE) {
532 struct page *page = virt_to_page(addr); 532 struct page *page = virt_to_page(addr);
533 ClearPageReserved(page); 533 ClearPageReserved(page);
534 set_page_count(page, 1); 534 init_page_count(page);
535 free_page(addr); 535 free_page(addr);
536 totalram_pages++; 536 totalram_pages++;
537 } 537 }
diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c
index 1f09a9d0fb83..e3ecaa453747 100644
--- a/arch/arm26/mm/init.c
+++ b/arch/arm26/mm/init.c
@@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
324 for (; addr < end; addr += PAGE_SIZE) { 324 for (; addr < end; addr += PAGE_SIZE) {
325 struct page *page = virt_to_page(addr); 325 struct page *page = virt_to_page(addr);
326 ClearPageReserved(page); 326 ClearPageReserved(page);
327 set_page_count(page, 1); 327 init_page_count(page);
328 free_page(addr); 328 free_page(addr);
329 totalram_pages++; 329 totalram_pages++;
330 } 330 }
diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c
index 31a0018b525a..b7842ff213a6 100644
--- a/arch/cris/mm/init.c
+++ b/arch/cris/mm/init.c
@@ -216,7 +216,7 @@ free_initmem(void)
216 addr = (unsigned long)(&__init_begin); 216 addr = (unsigned long)(&__init_begin);
217 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 217 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
218 ClearPageReserved(virt_to_page(addr)); 218 ClearPageReserved(virt_to_page(addr));
219 set_page_count(virt_to_page(addr), 1); 219 init_page_count(virt_to_page(addr));
220 free_page(addr); 220 free_page(addr);
221 totalram_pages++; 221 totalram_pages++;
222 } 222 }
diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c
index 0f1c6cbc4f50..aa6b7d0a2109 100644
--- a/arch/frv/kernel/frv_ksyms.c
+++ b/arch/frv/kernel/frv_ksyms.c
@@ -27,6 +27,7 @@ EXPORT_SYMBOL(__ioremap);
27EXPORT_SYMBOL(iounmap); 27EXPORT_SYMBOL(iounmap);
28 28
29EXPORT_SYMBOL(strnlen); 29EXPORT_SYMBOL(strnlen);
30EXPORT_SYMBOL(strpbrk);
30EXPORT_SYMBOL(strrchr); 31EXPORT_SYMBOL(strrchr);
31EXPORT_SYMBOL(strstr); 32EXPORT_SYMBOL(strstr);
32EXPORT_SYMBOL(strchr); 33EXPORT_SYMBOL(strchr);
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c
index 342823aad758..636b2f8b5d98 100644
--- a/arch/frv/mm/dma-alloc.c
+++ b/arch/frv/mm/dma-alloc.c
@@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle)
115 */ 115 */
116 if (order > 0) { 116 if (order > 0) {
117 struct page *rpage = virt_to_page(page); 117 struct page *rpage = virt_to_page(page);
118 118 split_page(rpage, order);
119 for (i = 1; i < (1 << order); i++)
120 set_page_count(rpage + i, 1);
121 } 119 }
122 120
123 err = 0; 121 err = 0;
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 765088ea8a50..8899aa1a4f06 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -169,7 +169,7 @@ void __init mem_init(void)
169 struct page *page = &mem_map[pfn]; 169 struct page *page = &mem_map[pfn];
170 170
171 ClearPageReserved(page); 171 ClearPageReserved(page);
172 set_page_count(page, 1); 172 init_page_count(page);
173 __free_page(page); 173 __free_page(page);
174 totalram_pages++; 174 totalram_pages++;
175 } 175 }
@@ -210,7 +210,7 @@ void __init free_initmem(void)
210 /* next to check that the page we free is not a partial page */ 210 /* next to check that the page we free is not a partial page */
211 for (addr = start; addr < end; addr += PAGE_SIZE) { 211 for (addr = start; addr < end; addr += PAGE_SIZE) {
212 ClearPageReserved(virt_to_page(addr)); 212 ClearPageReserved(virt_to_page(addr));
213 set_page_count(virt_to_page(addr), 1); 213 init_page_count(virt_to_page(addr));
214 free_page(addr); 214 free_page(addr);
215 totalram_pages++; 215 totalram_pages++;
216 } 216 }
@@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
230 int pages = 0; 230 int pages = 0;
231 for (; start < end; start += PAGE_SIZE) { 231 for (; start < end; start += PAGE_SIZE) {
232 ClearPageReserved(virt_to_page(start)); 232 ClearPageReserved(virt_to_page(start));
233 set_page_count(virt_to_page(start), 1); 233 init_page_count(virt_to_page(start));
234 free_page(start); 234 free_page(start);
235 totalram_pages++; 235 totalram_pages++;
236 pages++; 236 pages++;
diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c
index 5cc76efaf7aa..69d6ad32d56c 100644
--- a/arch/h8300/kernel/h8300_ksyms.c
+++ b/arch/h8300/kernel/h8300_ksyms.c
@@ -25,6 +25,7 @@ extern char h8300_debug_device[];
25/* platform dependent support */ 25/* platform dependent support */
26 26
27EXPORT_SYMBOL(strnlen); 27EXPORT_SYMBOL(strnlen);
28EXPORT_SYMBOL(strpbrk);
28EXPORT_SYMBOL(strrchr); 29EXPORT_SYMBOL(strrchr);
29EXPORT_SYMBOL(strstr); 30EXPORT_SYMBOL(strstr);
30EXPORT_SYMBOL(strchr); 31EXPORT_SYMBOL(strchr);
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1e0929ddc8c4..09efc4b1f038 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
196 int pages = 0; 196 int pages = 0;
197 for (; start < end; start += PAGE_SIZE) { 197 for (; start < end; start += PAGE_SIZE) {
198 ClearPageReserved(virt_to_page(start)); 198 ClearPageReserved(virt_to_page(start));
199 set_page_count(virt_to_page(start), 1); 199 init_page_count(virt_to_page(start));
200 free_page(start); 200 free_page(start);
201 totalram_pages++; 201 totalram_pages++;
202 pages++; 202 pages++;
@@ -219,7 +219,7 @@ free_initmem()
219 /* next to check that the page we free is not a partial page */ 219 /* next to check that the page we free is not a partial page */
220 for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { 220 for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
221 ClearPageReserved(virt_to_page(addr)); 221 ClearPageReserved(virt_to_page(addr));
222 set_page_count(virt_to_page(addr), 1); 222 init_page_count(virt_to_page(addr));
223 free_page(addr); 223 free_page(addr);
224 totalram_pages++; 224 totalram_pages++;
225 } 225 }
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index c9cad7ba0d2d..aeabb4196861 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void)
115 unsigned long cr4; 115 unsigned long cr4;
116 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); 116 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
117 117
118 cpu_gdt_descr->address = __va(cpu_gdt_descr->address); 118 cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address);
119 load_gdt(cpu_gdt_descr); 119 load_gdt(cpu_gdt_descr);
120 120
121 cr4 = read_cr4(); 121 cr4 = read_cr4();
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 218d725a5a1e..d134e9643a58 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void)
504 spin_unlock_irq(&call_lock); 504 spin_unlock_irq(&call_lock);
505} 505}
506 506
507static struct call_data_struct * call_data; 507static struct call_data_struct *call_data;
508 508
509/* 509/**
510 * this function sends a 'generic call function' IPI to all other CPUs 510 * smp_call_function(): Run a function on all other CPUs.
511 * in the system. 511 * @func: The function to run. This must be fast and non-blocking.
512 */ 512 * @info: An arbitrary pointer to pass to the function.
513 513 * @nonatomic: currently unused.
514int smp_call_function (void (*func) (void *info), void *info, int nonatomic, 514 * @wait: If true, wait (atomically) until function has completed on other CPUs.
515 int wait) 515 *
516/* 516 * Returns 0 on success, else a negative status code. Does not return until
517 * [SUMMARY] Run a function on all other CPUs.
518 * <func> The function to run. This must be fast and non-blocking.
519 * <info> An arbitrary pointer to pass to the function.
520 * <nonatomic> currently unused.
521 * <wait> If true, wait (atomically) until function has completed on other CPUs.
522 * [RETURNS] 0 on success, else a negative status code. Does not return until
523 * remote CPUs are nearly ready to execute <<func>> or are or have executed. 517 * remote CPUs are nearly ready to execute <<func>> or are or have executed.
524 * 518 *
525 * You must not call this function with disabled interrupts or from a 519 * You must not call this function with disabled interrupts or from a
526 * hardware interrupt handler or from a bottom half handler. 520 * hardware interrupt handler or from a bottom half handler.
527 */ 521 */
522int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
523 int wait)
528{ 524{
529 struct call_data_struct data; 525 struct call_data_struct data;
530 int cpus; 526 int cpus;
diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
index a4a61976ecb9..8fdb1fb17a5f 100644
--- a/arch/i386/kernel/sys_i386.c
+++ b/arch/i386/kernel/sys_i386.c
@@ -40,14 +40,13 @@ asmlinkage int sys_pipe(unsigned long __user * fildes)
40 return error; 40 return error;
41} 41}
42 42
43/* common code for old and new mmaps */ 43asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
44static inline long do_mmap2( 44 unsigned long prot, unsigned long flags,
45 unsigned long addr, unsigned long len, 45 unsigned long fd, unsigned long pgoff)
46 unsigned long prot, unsigned long flags,
47 unsigned long fd, unsigned long pgoff)
48{ 46{
49 int error = -EBADF; 47 int error = -EBADF;
50 struct file * file = NULL; 48 struct file *file = NULL;
49 struct mm_struct *mm = current->mm;
51 50
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 51 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) { 52 if (!(flags & MAP_ANONYMOUS)) {
@@ -56,9 +55,9 @@ static inline long do_mmap2(
56 goto out; 55 goto out;
57 } 56 }
58 57
59 down_write(&current->mm->mmap_sem); 58 down_write(&mm->mmap_sem);
60 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); 59 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
61 up_write(&current->mm->mmap_sem); 60 up_write(&mm->mmap_sem);
62 61
63 if (file) 62 if (file)
64 fput(file); 63 fput(file);
@@ -66,13 +65,6 @@ out:
66 return error; 65 return error;
67} 66}
68 67
69asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
70 unsigned long prot, unsigned long flags,
71 unsigned long fd, unsigned long pgoff)
72{
73 return do_mmap2(addr, len, prot, flags, fd, pgoff);
74}
75
76/* 68/*
77 * Perform the select(nd, in, out, ex, tv) and mmap() system 69 * Perform the select(nd, in, out, ex, tv) and mmap() system
78 * calls. Linux/i386 didn't use to be able to handle more than 70 * calls. Linux/i386 didn't use to be able to handle more than
@@ -101,7 +93,8 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
101 if (a.offset & ~PAGE_MASK) 93 if (a.offset & ~PAGE_MASK)
102 goto out; 94 goto out;
103 95
104 err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); 96 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
97 a.fd, a.offset >> PAGE_SHIFT);
105out: 98out:
106 return err; 99 return err;
107} 100}
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index be242723c339..17a6fe7166e7 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -46,7 +46,7 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
46 * 46 *
47 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 47 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
48 */ 48 */
49static unsigned long cyc2ns_scale; 49static unsigned long cyc2ns_scale __read_mostly;
50#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 50#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
51 51
52static inline void set_cyc2ns_scale(unsigned long cpu_khz) 52static inline void set_cyc2ns_scale(unsigned long cpu_khz)
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
index a7f5a2aceba2..5e41ee29c8cf 100644
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -74,7 +74,7 @@ late_initcall(start_lost_tick_compensation);
74 * 74 *
75 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 75 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
76 */ 76 */
77static unsigned long cyc2ns_scale; 77static unsigned long cyc2ns_scale __read_mostly;
78#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 78#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
79 79
80static inline void set_cyc2ns_scale(unsigned long cpu_khz) 80static inline void set_cyc2ns_scale(unsigned long cpu_khz)
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index d524127c9afc..a7d891585411 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
48 return (pte_t *) pmd; 48 return (pte_t *) pmd;
49} 49}
50 50
51/*
52 * This function checks for proper alignment of input addr and len parameters.
53 */
54int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
55{
56 if (len & ~HPAGE_MASK)
57 return -EINVAL;
58 if (addr & ~HPAGE_MASK)
59 return -EINVAL;
60 return 0;
61}
62
63#if 0 /* This is just for testing */ 51#if 0 /* This is just for testing */
64struct page * 52struct page *
65follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 53follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 2700f01994ba..7ba55a6e2dbc 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
270 270
271static void __meminit free_new_highpage(struct page *page) 271static void __meminit free_new_highpage(struct page *page)
272{ 272{
273 set_page_count(page, 1); 273 init_page_count(page);
274 __free_page(page); 274 __free_page(page);
275 totalhigh_pages++; 275 totalhigh_pages++;
276} 276}
@@ -727,7 +727,7 @@ void free_initmem(void)
727 addr = (unsigned long)(&__init_begin); 727 addr = (unsigned long)(&__init_begin);
728 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 728 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
729 ClearPageReserved(virt_to_page(addr)); 729 ClearPageReserved(virt_to_page(addr));
730 set_page_count(virt_to_page(addr), 1); 730 init_page_count(virt_to_page(addr));
731 memset((void *)addr, 0xcc, PAGE_SIZE); 731 memset((void *)addr, 0xcc, PAGE_SIZE);
732 free_page(addr); 732 free_page(addr);
733 totalram_pages++; 733 totalram_pages++;
@@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
766 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 766 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
767 for (; start < end; start += PAGE_SIZE) { 767 for (; start < end; start += PAGE_SIZE) {
768 ClearPageReserved(virt_to_page(start)); 768 ClearPageReserved(virt_to_page(start));
769 set_page_count(virt_to_page(start), 1); 769 init_page_count(virt_to_page(start));
770 free_page(start); 770 free_page(start);
771 totalram_pages++; 771 totalram_pages++;
772 } 772 }
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index d0cadb33b54c..92c3d9f0e731 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -51,6 +51,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
51 if (!base) 51 if (!base)
52 return NULL; 52 return NULL;
53 53
54 /*
55 * page_private is used to track the number of entries in
56 * the page table page that have non standard attributes.
57 */
58 SetPagePrivate(base);
59 page_private(base) = 0;
60
54 address = __pa(address); 61 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK; 62 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base); 63 pbase = (pte_t *)page_address(base);
@@ -143,11 +150,12 @@ __change_page_attr(struct page *page, pgprot_t prot)
143 return -ENOMEM; 150 return -ENOMEM;
144 set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); 151 set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
145 kpte_page = split; 152 kpte_page = split;
146 } 153 }
147 get_page(kpte_page); 154 page_private(kpte_page)++;
148 } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 155 } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
149 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); 156 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
150 __put_page(kpte_page); 157 BUG_ON(page_private(kpte_page) == 0);
158 page_private(kpte_page)--;
151 } else 159 } else
152 BUG(); 160 BUG();
153 161
@@ -157,10 +165,8 @@ __change_page_attr(struct page *page, pgprot_t prot)
157 * replace it with a largepage. 165 * replace it with a largepage.
158 */ 166 */
159 if (!PageReserved(kpte_page)) { 167 if (!PageReserved(kpte_page)) {
160 /* memleak and potential failed 2M page regeneration */ 168 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
161 BUG_ON(!page_count(kpte_page)); 169 ClearPagePrivate(kpte_page);
162
163 if (cpu_has_pse && (page_count(kpte_page) == 1)) {
164 list_add(&kpte_page->lru, &df_list); 170 list_add(&kpte_page->lru, &df_list);
165 revert_page(kpte_page, address); 171 revert_page(kpte_page, address);
166 } 172 }
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index a85ea9d37f05..ff7ae6b664e8 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -271,6 +271,25 @@ config SCHED_SMT
271 Intel IA64 chips with MultiThreading at a cost of slightly increased 271 Intel IA64 chips with MultiThreading at a cost of slightly increased
272 overhead in some places. If unsure say N here. 272 overhead in some places. If unsure say N here.
273 273
274config PERMIT_BSP_REMOVE
275 bool "Support removal of Bootstrap Processor"
276 depends on HOTPLUG_CPU
277 default n
278 ---help---
279 Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU
280 support.
281
282config FORCE_CPEI_RETARGET
283 bool "Force assumption that CPEI can be re-targetted"
284 depends on PERMIT_BSP_REMOVE
285 default n
286 ---help---
287 Say Y if you need to force the assumption that CPEI can be re-targetted to
288 any cpu in the system. This hint is available via ACPI 3.0 specifications.
289 Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP.
 290 This option is useful for enabling this feature on older BIOSes as well.
291 You can also enable this by using boot command line option force_cpei=1.
292
274config PREEMPT 293config PREEMPT
275 bool "Preemptible Kernel" 294 bool "Preemptible Kernel"
276 help 295 help
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index 125568118b84..766bf4955432 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -116,6 +116,8 @@ CONFIG_FORCE_MAX_ZONEORDER=17
116CONFIG_SMP=y 116CONFIG_SMP=y
117CONFIG_NR_CPUS=4 117CONFIG_NR_CPUS=4
118CONFIG_HOTPLUG_CPU=y 118CONFIG_HOTPLUG_CPU=y
119CONFIG_PERMIT_BSP_REMOVE=y
120CONFIG_FORCE_CPEI_RETARGET=y
119# CONFIG_SCHED_SMT is not set 121# CONFIG_SCHED_SMT is not set
120# CONFIG_PREEMPT is not set 122# CONFIG_PREEMPT is not set
121CONFIG_SELECT_MEMORY_MODEL=y 123CONFIG_SELECT_MEMORY_MODEL=y
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index ecd44bdc8394..4722ec51c70c 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -284,19 +284,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header,
284 return 0; 284 return 0;
285} 285}
286 286
287#ifdef CONFIG_HOTPLUG_CPU
287unsigned int can_cpei_retarget(void) 288unsigned int can_cpei_retarget(void)
288{ 289{
289 extern int cpe_vector; 290 extern int cpe_vector;
291 extern unsigned int force_cpei_retarget;
290 292
291 /* 293 /*
292 * Only if CPEI is supported and the override flag 294 * Only if CPEI is supported and the override flag
293 * is present, otherwise return that its re-targettable 295 * is present, otherwise return that its re-targettable
294 * if we are in polling mode. 296 * if we are in polling mode.
295 */ 297 */
296 if (cpe_vector > 0 && !acpi_cpei_override) 298 if (cpe_vector > 0) {
297 return 0; 299 if (acpi_cpei_override || force_cpei_retarget)
298 else 300 return 1;
299 return 1; 301 else
302 return 0;
303 }
304 return 1;
300} 305}
301 306
302unsigned int is_cpu_cpei_target(unsigned int cpu) 307unsigned int is_cpu_cpei_target(unsigned int cpu)
@@ -315,6 +320,7 @@ void set_cpei_target_cpu(unsigned int cpu)
315{ 320{
316 acpi_cpei_phys_cpuid = cpu_physical_id(cpu); 321 acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
317} 322}
323#endif
318 324
319unsigned int get_cpei_target_cpu(void) 325unsigned int get_cpei_target_cpu(void)
320{ 326{
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 930fdfca6ddb..0e3eda99e549 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1102,9 +1102,6 @@ skip_rbs_switch:
1102 st8 [r2]=r8 1102 st8 [r2]=r8
1103 st8 [r3]=r10 1103 st8 [r3]=r10
1104.work_pending: 1104.work_pending:
1105 tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context?
1106(p6) br.cond.sptk.few .sigdelayed
1107 ;;
1108 tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? 1105 tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0?
1109(p6) br.cond.sptk.few .notify 1106(p6) br.cond.sptk.few .notify
1110#ifdef CONFIG_PREEMPT 1107#ifdef CONFIG_PREEMPT
@@ -1131,17 +1128,6 @@ skip_rbs_switch:
1131(pLvSys)br.cond.sptk.few .work_pending_syscall_end 1128(pLvSys)br.cond.sptk.few .work_pending_syscall_end
1132 br.cond.sptk.many .work_processed_kernel // don't re-check 1129 br.cond.sptk.many .work_processed_kernel // don't re-check
1133 1130
1134// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
1135// it could not be delivered. Deliver it now. The signal might be for us and
1136// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
1137// signal.
1138
1139.sigdelayed:
1140 br.call.sptk.many rp=do_sigdelayed
1141 cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check
1142(pLvSys)br.cond.sptk.few .work_pending_syscall_end
1143 br.cond.sptk.many .work_processed_kernel // re-check
1144
1145.work_pending_syscall_end: 1131.work_pending_syscall_end:
1146 adds r2=PT(R8)+16,r12 1132 adds r2=PT(R8)+16,r12
1147 adds r3=PT(R10)+16,r12 1133 adds r3=PT(R10)+16,r12
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 574084f343fa..8832c553230a 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -631,6 +631,7 @@ get_target_cpu (unsigned int gsi, int vector)
631{ 631{
632#ifdef CONFIG_SMP 632#ifdef CONFIG_SMP
633 static int cpu = -1; 633 static int cpu = -1;
634 extern int cpe_vector;
634 635
635 /* 636 /*
636 * In case of vector shared by multiple RTEs, all RTEs that 637 * In case of vector shared by multiple RTEs, all RTEs that
@@ -653,6 +654,11 @@ get_target_cpu (unsigned int gsi, int vector)
653 if (!cpu_online(smp_processor_id())) 654 if (!cpu_online(smp_processor_id()))
654 return cpu_physical_id(smp_processor_id()); 655 return cpu_physical_id(smp_processor_id());
655 656
657#ifdef CONFIG_ACPI
658 if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR)
659 return get_cpei_target_cpu();
660#endif
661
656#ifdef CONFIG_NUMA 662#ifdef CONFIG_NUMA
657 { 663 {
658 int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; 664 int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index d33244c32759..5ce908ef9c95 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -163,8 +163,19 @@ void fixup_irqs(void)
163{ 163{
164 unsigned int irq; 164 unsigned int irq;
165 extern void ia64_process_pending_intr(void); 165 extern void ia64_process_pending_intr(void);
166 extern void ia64_disable_timer(void);
167 extern volatile int time_keeper_id;
168
169 ia64_disable_timer();
170
171 /*
172 * Find a new timesync master
173 */
174 if (smp_processor_id() == time_keeper_id) {
175 time_keeper_id = first_cpu(cpu_online_map);
176 printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id);
177 }
166 178
167 ia64_set_itv(1<<16);
168 /* 179 /*
169 * Phase 1: Locate irq's bound to this cpu and 180 * Phase 1: Locate irq's bound to this cpu and
170 * relocate them for cpu removal. 181 * relocate them for cpu removal.
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index ee7eec9ee576..b57e723f194c 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -281,14 +281,10 @@ ia64_mca_log_sal_error_record(int sal_info_type)
281 ia64_sal_clear_state_info(sal_info_type); 281 ia64_sal_clear_state_info(sal_info_type);
282} 282}
283 283
284/*
285 * platform dependent error handling
286 */
287#ifndef PLATFORM_MCA_HANDLERS
288
289#ifdef CONFIG_ACPI 284#ifdef CONFIG_ACPI
290 285
291int cpe_vector = -1; 286int cpe_vector = -1;
287int ia64_cpe_irq = -1;
292 288
293static irqreturn_t 289static irqreturn_t
294ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) 290ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
@@ -377,8 +373,6 @@ ia64_mca_register_cpev (int cpev)
377} 373}
378#endif /* CONFIG_ACPI */ 374#endif /* CONFIG_ACPI */
379 375
380#endif /* PLATFORM_MCA_HANDLERS */
381
382/* 376/*
383 * ia64_mca_cmc_vector_setup 377 * ia64_mca_cmc_vector_setup
384 * 378 *
@@ -630,6 +624,32 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat)
630 *tnat |= (nat << tslot); 624 *tnat |= (nat << tslot);
631} 625}
632 626
627/* Change the comm field on the MCA/INT task to include the pid that
628 * was interrupted, it makes for easier debugging. If that pid was 0
629 * (swapper or nested MCA/INIT) then use the start of the previous comm
630 * field suffixed with its cpu.
631 */
632
633static void
634ia64_mca_modify_comm(const task_t *previous_current)
635{
636 char *p, comm[sizeof(current->comm)];
637 if (previous_current->pid)
638 snprintf(comm, sizeof(comm), "%s %d",
639 current->comm, previous_current->pid);
640 else {
641 int l;
642 if ((p = strchr(previous_current->comm, ' ')))
643 l = p - previous_current->comm;
644 else
645 l = strlen(previous_current->comm);
646 snprintf(comm, sizeof(comm), "%s %*s %d",
647 current->comm, l, previous_current->comm,
648 task_thread_info(previous_current)->cpu);
649 }
650 memcpy(current->comm, comm, sizeof(current->comm));
651}
652
633/* On entry to this routine, we are running on the per cpu stack, see 653/* On entry to this routine, we are running on the per cpu stack, see
634 * mca_asm.h. The original stack has not been touched by this event. Some of 654 * mca_asm.h. The original stack has not been touched by this event. Some of
635 * the original stack's registers will be in the RBS on this stack. This stack 655 * the original stack's registers will be in the RBS on this stack. This stack
@@ -648,7 +668,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
648 struct ia64_sal_os_state *sos, 668 struct ia64_sal_os_state *sos,
649 const char *type) 669 const char *type)
650{ 670{
651 char *p, comm[sizeof(current->comm)]; 671 char *p;
652 ia64_va va; 672 ia64_va va;
653 extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ 673 extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */
654 const pal_min_state_area_t *ms = sos->pal_min_state; 674 const pal_min_state_area_t *ms = sos->pal_min_state;
@@ -721,6 +741,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
721 /* Verify the previous stack state before we change it */ 741 /* Verify the previous stack state before we change it */
722 if (user_mode(regs)) { 742 if (user_mode(regs)) {
723 msg = "occurred in user space"; 743 msg = "occurred in user space";
744 /* previous_current is guaranteed to be valid when the task was
745 * in user space, so ...
746 */
747 ia64_mca_modify_comm(previous_current);
724 goto no_mod; 748 goto no_mod;
725 } 749 }
726 if (r13 != sos->prev_IA64_KR_CURRENT) { 750 if (r13 != sos->prev_IA64_KR_CURRENT) {
@@ -750,25 +774,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
750 goto no_mod; 774 goto no_mod;
751 } 775 }
752 776
753 /* Change the comm field on the MCA/INT task to include the pid that 777 ia64_mca_modify_comm(previous_current);
754 * was interrupted, it makes for easier debugging. If that pid was 0
755 * (swapper or nested MCA/INIT) then use the start of the previous comm
756 * field suffixed with its cpu.
757 */
758 if (previous_current->pid)
759 snprintf(comm, sizeof(comm), "%s %d",
760 current->comm, previous_current->pid);
761 else {
762 int l;
763 if ((p = strchr(previous_current->comm, ' ')))
764 l = p - previous_current->comm;
765 else
766 l = strlen(previous_current->comm);
767 snprintf(comm, sizeof(comm), "%s %*s %d",
768 current->comm, l, previous_current->comm,
769 task_thread_info(previous_current)->cpu);
770 }
771 memcpy(current->comm, comm, sizeof(current->comm));
772 778
773 /* Make the original task look blocked. First stack a struct pt_regs, 779 /* Make the original task look blocked. First stack a struct pt_regs,
774 * describing the state at the time of interrupt. mca_asm.S built a 780 * describing the state at the time of interrupt. mca_asm.S built a
@@ -908,7 +914,7 @@ no_mod:
908static void 914static void
909ia64_wait_for_slaves(int monarch) 915ia64_wait_for_slaves(int monarch)
910{ 916{
911 int c, wait = 0; 917 int c, wait = 0, missing = 0;
912 for_each_online_cpu(c) { 918 for_each_online_cpu(c) {
913 if (c == monarch) 919 if (c == monarch)
914 continue; 920 continue;
@@ -919,15 +925,32 @@ ia64_wait_for_slaves(int monarch)
919 } 925 }
920 } 926 }
921 if (!wait) 927 if (!wait)
922 return; 928 goto all_in;
923 for_each_online_cpu(c) { 929 for_each_online_cpu(c) {
924 if (c == monarch) 930 if (c == monarch)
925 continue; 931 continue;
926 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { 932 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
927 udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ 933 udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */
934 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
935 missing = 1;
928 break; 936 break;
929 } 937 }
930 } 938 }
939 if (!missing)
940 goto all_in;
941 printk(KERN_INFO "OS MCA slave did not rendezvous on cpu");
942 for_each_online_cpu(c) {
943 if (c == monarch)
944 continue;
945 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
946 printk(" %d", c);
947 }
948 printk("\n");
949 return;
950
951all_in:
952 printk(KERN_INFO "All OS MCA slaves have reached rendezvous\n");
953 return;
931} 954}
932 955
933/* 956/*
@@ -953,6 +976,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
953 task_t *previous_current; 976 task_t *previous_current;
954 977
955 oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ 978 oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
979 console_loglevel = 15; /* make sure printks make it to console */
980 printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n",
981 sos->proc_state_param, cpu, sos->monarch);
982
956 previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); 983 previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
957 monarch_cpu = cpu; 984 monarch_cpu = cpu;
958 if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) 985 if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0)
@@ -1444,11 +1471,13 @@ void __devinit
1444ia64_mca_cpu_init(void *cpu_data) 1471ia64_mca_cpu_init(void *cpu_data)
1445{ 1472{
1446 void *pal_vaddr; 1473 void *pal_vaddr;
1474 static int first_time = 1;
1447 1475
1448 if (smp_processor_id() == 0) { 1476 if (first_time) {
1449 void *mca_data; 1477 void *mca_data;
1450 int cpu; 1478 int cpu;
1451 1479
1480 first_time = 0;
1452 mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) 1481 mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu)
1453 * NR_CPUS + KERNEL_STACK_SIZE); 1482 * NR_CPUS + KERNEL_STACK_SIZE);
1454 mca_data = (void *)(((unsigned long)mca_data + 1483 mca_data = (void *)(((unsigned long)mca_data +
@@ -1704,6 +1733,7 @@ ia64_mca_late_init(void)
1704 desc = irq_descp(irq); 1733 desc = irq_descp(irq);
1705 desc->status |= IRQ_PER_CPU; 1734 desc->status |= IRQ_PER_CPU;
1706 setup_irq(irq, &mca_cpe_irqaction); 1735 setup_irq(irq, &mca_cpe_irqaction);
1736 ia64_cpe_irq = irq;
1707 } 1737 }
1708 ia64_mca_register_cpev(cpe_vector); 1738 ia64_mca_register_cpev(cpe_vector);
1709 IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); 1739 IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__);
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 9c5194b385da..077f21216b65 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -6722,6 +6722,7 @@ __initcall(pfm_init);
6722void 6722void
6723pfm_init_percpu (void) 6723pfm_init_percpu (void)
6724{ 6724{
6725 static int first_time=1;
6725 /* 6726 /*
6726 * make sure no measurement is active 6727 * make sure no measurement is active
6727 * (may inherit programmed PMCs from EFI). 6728 * (may inherit programmed PMCs from EFI).
@@ -6734,8 +6735,10 @@ pfm_init_percpu (void)
6734 */ 6735 */
6735 pfm_unfreeze_pmu(); 6736 pfm_unfreeze_pmu();
6736 6737
6737 if (smp_processor_id() == 0) 6738 if (first_time) {
6738 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); 6739 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
6740 first_time=0;
6741 }
6739 6742
6740 ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); 6743 ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
6741 ia64_srlz_d(); 6744 ia64_srlz_d();
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 463f6bb44d07..1d7903ee2126 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -588,104 +588,3 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
588 } 588 }
589 return 0; 589 return 0;
590} 590}
591
592/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it
593 * could not be delivered. It is important that the target process is not
594 * allowed to do any more work in user space. Possible cases for the target
595 * process:
596 *
597 * - It is sleeping and will wake up soon. Store the data in the current task,
598 * the signal will be sent when the current task returns from the next
599 * interrupt.
600 *
601 * - It is running in user context. Store the data in the current task, the
602 * signal will be sent when the current task returns from the next interrupt.
603 *
604 * - It is running in kernel context on this or another cpu and will return to
605 * user context. Store the data in the target task, the signal will be sent
606 * to itself when the target task returns to user space.
607 *
608 * - It is running in kernel context on this cpu and will sleep before
609 * returning to user context. Because this is also the current task, the
610 * signal will not get delivered and the task could sleep indefinitely.
611 * Store the data in the idle task for this cpu, the signal will be sent
612 * after the idle task processes its next interrupt.
613 *
614 * To cover all cases, store the data in the target task, the current task and
615 * the idle task on this cpu. Whatever happens, the signal will be delivered
616 * to the target task before it can do any useful user space work. Multiple
617 * deliveries have no unwanted side effects.
618 *
619 * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts
620 * disabled. It must not take any locks nor use kernel structures or services
621 * that require locks.
622 */
623
624/* To ensure that we get the right pid, check its start time. To avoid extra
625 * include files in thread_info.h, convert the task start_time to unsigned long,
626 * giving us a cycle time of > 580 years.
627 */
628static inline unsigned long
629start_time_ul(const struct task_struct *t)
630{
631 return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec;
632}
633
634void
635set_sigdelayed(pid_t pid, int signo, int code, void __user *addr)
636{
637 struct task_struct *t;
638 unsigned long start_time = 0;
639 int i;
640
641 for (i = 1; i <= 3; ++i) {
642 switch (i) {
643 case 1:
644 t = find_task_by_pid(pid);
645 if (t)
646 start_time = start_time_ul(t);
647 break;
648 case 2:
649 t = current;
650 break;
651 default:
652 t = idle_task(smp_processor_id());
653 break;
654 }
655
656 if (!t)
657 return;
658 task_thread_info(t)->sigdelayed.signo = signo;
659 task_thread_info(t)->sigdelayed.code = code;
660 task_thread_info(t)->sigdelayed.addr = addr;
661 task_thread_info(t)->sigdelayed.start_time = start_time;
662 task_thread_info(t)->sigdelayed.pid = pid;
663 wmb();
664 set_tsk_thread_flag(t, TIF_SIGDELAYED);
665 }
666}
667
668/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that
669 * was detected in MCA/INIT/NMI/PMI context where it could not be delivered.
670 */
671
672void
673do_sigdelayed(void)
674{
675 struct siginfo siginfo;
676 pid_t pid;
677 struct task_struct *t;
678
679 clear_thread_flag(TIF_SIGDELAYED);
680 memset(&siginfo, 0, sizeof(siginfo));
681 siginfo.si_signo = current_thread_info()->sigdelayed.signo;
682 siginfo.si_code = current_thread_info()->sigdelayed.code;
683 siginfo.si_addr = current_thread_info()->sigdelayed.addr;
684 pid = current_thread_info()->sigdelayed.pid;
685 t = find_task_by_pid(pid);
686 if (!t)
687 return;
688 if (current_thread_info()->sigdelayed.start_time != start_time_ul(t))
689 return;
690 force_sig_info(siginfo.si_signo, &siginfo, t);
691}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index b681ef34a86e..c4b633b36dab 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -70,6 +70,12 @@
70#endif 70#endif
71 71
72#ifdef CONFIG_HOTPLUG_CPU 72#ifdef CONFIG_HOTPLUG_CPU
73#ifdef CONFIG_PERMIT_BSP_REMOVE
74#define bsp_remove_ok 1
75#else
76#define bsp_remove_ok 0
77#endif
78
73/* 79/*
74 * Store all idle threads, this can be reused instead of creating 80 * Store all idle threads, this can be reused instead of creating
75 * a new thread. Also avoids complicated thread destroy functionality 81 * a new thread. Also avoids complicated thread destroy functionality
@@ -104,7 +110,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
104/* 110/*
105 * ITC synchronization related stuff: 111 * ITC synchronization related stuff:
106 */ 112 */
107#define MASTER 0 113#define MASTER (0)
108#define SLAVE (SMP_CACHE_BYTES/8) 114#define SLAVE (SMP_CACHE_BYTES/8)
109 115
110#define NUM_ROUNDS 64 /* magic value */ 116#define NUM_ROUNDS 64 /* magic value */
@@ -151,6 +157,27 @@ char __initdata no_int_routing;
151 157
152unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */ 158unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
153 159
160#ifdef CONFIG_FORCE_CPEI_RETARGET
161#define CPEI_OVERRIDE_DEFAULT (1)
162#else
163#define CPEI_OVERRIDE_DEFAULT (0)
164#endif
165
166unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT;
167
168static int __init
169cmdl_force_cpei(char *str)
170{
171 int value=0;
172
173 get_option (&str, &value);
174 force_cpei_retarget = value;
175
176 return 1;
177}
178
179__setup("force_cpei=", cmdl_force_cpei);
180
154static int __init 181static int __init
155nointroute (char *str) 182nointroute (char *str)
156{ 183{
@@ -161,6 +188,27 @@ nointroute (char *str)
161 188
162__setup("nointroute", nointroute); 189__setup("nointroute", nointroute);
163 190
191static void fix_b0_for_bsp(void)
192{
193#ifdef CONFIG_HOTPLUG_CPU
194 int cpuid;
195 static int fix_bsp_b0 = 1;
196
197 cpuid = smp_processor_id();
198
199 /*
200 * Cache the b0 value on the first AP that comes up
201 */
202 if (!(fix_bsp_b0 && cpuid))
203 return;
204
205 sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0];
206 printk ("Fixed BSP b0 value from CPU %d\n", cpuid);
207
208 fix_bsp_b0 = 0;
209#endif
210}
211
164void 212void
165sync_master (void *arg) 213sync_master (void *arg)
166{ 214{
@@ -327,8 +375,9 @@ smp_setup_percpu_timer (void)
327static void __devinit 375static void __devinit
328smp_callin (void) 376smp_callin (void)
329{ 377{
330 int cpuid, phys_id; 378 int cpuid, phys_id, itc_master;
331 extern void ia64_init_itm(void); 379 extern void ia64_init_itm(void);
380 extern volatile int time_keeper_id;
332 381
333#ifdef CONFIG_PERFMON 382#ifdef CONFIG_PERFMON
334 extern void pfm_init_percpu(void); 383 extern void pfm_init_percpu(void);
@@ -336,6 +385,7 @@ smp_callin (void)
336 385
337 cpuid = smp_processor_id(); 386 cpuid = smp_processor_id();
338 phys_id = hard_smp_processor_id(); 387 phys_id = hard_smp_processor_id();
388 itc_master = time_keeper_id;
339 389
340 if (cpu_online(cpuid)) { 390 if (cpu_online(cpuid)) {
341 printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", 391 printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
@@ -343,6 +393,8 @@ smp_callin (void)
343 BUG(); 393 BUG();
344 } 394 }
345 395
396 fix_b0_for_bsp();
397
346 lock_ipi_calllock(); 398 lock_ipi_calllock();
347 cpu_set(cpuid, cpu_online_map); 399 cpu_set(cpuid, cpu_online_map);
348 unlock_ipi_calllock(); 400 unlock_ipi_calllock();
@@ -365,8 +417,8 @@ smp_callin (void)
365 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls 417 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls
366 * local_bh_enable(), which bugs out if irqs are not enabled... 418 * local_bh_enable(), which bugs out if irqs are not enabled...
367 */ 419 */
368 Dprintk("Going to syncup ITC with BP.\n"); 420 Dprintk("Going to syncup ITC with ITC Master.\n");
369 ia64_sync_itc(0); 421 ia64_sync_itc(itc_master);
370 } 422 }
371 423
372 /* 424 /*
@@ -635,6 +687,47 @@ remove_siblinginfo(int cpu)
635} 687}
636 688
637extern void fixup_irqs(void); 689extern void fixup_irqs(void);
690
691int migrate_platform_irqs(unsigned int cpu)
692{
693 int new_cpei_cpu;
694 irq_desc_t *desc = NULL;
695 cpumask_t mask;
696 int retval = 0;
697
698 /*
699 * dont permit CPEI target to removed.
700 */
701 if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) {
702 printk ("CPU (%d) is CPEI Target\n", cpu);
703 if (can_cpei_retarget()) {
704 /*
705 * Now re-target the CPEI to a different processor
706 */
707 new_cpei_cpu = any_online_cpu(cpu_online_map);
708 mask = cpumask_of_cpu(new_cpei_cpu);
709 set_cpei_target_cpu(new_cpei_cpu);
710 desc = irq_descp(ia64_cpe_irq);
711 /*
712 * Switch for now, immediatly, we need to do fake intr
713 * as other interrupts, but need to study CPEI behaviour with
714 * polling before making changes.
715 */
716 if (desc) {
717 desc->handler->disable(ia64_cpe_irq);
718 desc->handler->set_affinity(ia64_cpe_irq, mask);
719 desc->handler->enable(ia64_cpe_irq);
720 printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu);
721 }
722 }
723 if (!desc) {
724 printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu);
725 retval = -EBUSY;
726 }
727 }
728 return retval;
729}
730
638/* must be called with cpucontrol mutex held */ 731/* must be called with cpucontrol mutex held */
639int __cpu_disable(void) 732int __cpu_disable(void)
640{ 733{
@@ -643,8 +736,17 @@ int __cpu_disable(void)
643 /* 736 /*
644 * dont permit boot processor for now 737 * dont permit boot processor for now
645 */ 738 */
646 if (cpu == 0) 739 if (cpu == 0 && !bsp_remove_ok) {
647 return -EBUSY; 740 printk ("Your platform does not support removal of BSP\n");
741 return (-EBUSY);
742 }
743
744 cpu_clear(cpu, cpu_online_map);
745
746 if (migrate_platform_irqs(cpu)) {
747 cpu_set(cpu, cpu_online_map);
748 return (-EBUSY);
749 }
648 750
649 remove_siblinginfo(cpu); 751 remove_siblinginfo(cpu);
650 cpu_clear(cpu, cpu_online_map); 752 cpu_clear(cpu, cpu_online_map);
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 307d01e15b2e..ac167436e936 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -32,7 +32,7 @@
32 32
33extern unsigned long wall_jiffies; 33extern unsigned long wall_jiffies;
34 34
35#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ 35volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
36 36
37#ifdef CONFIG_IA64_DEBUG_IRQ 37#ifdef CONFIG_IA64_DEBUG_IRQ
38 38
@@ -71,7 +71,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
71 71
72 new_itm += local_cpu_data->itm_delta; 72 new_itm += local_cpu_data->itm_delta;
73 73
74 if (smp_processor_id() == TIME_KEEPER_ID) { 74 if (smp_processor_id() == time_keeper_id) {
75 /* 75 /*
76 * Here we are in the timer irq handler. We have irqs locally 76 * Here we are in the timer irq handler. We have irqs locally
77 * disabled, but we don't know if the timer_bh is running on 77 * disabled, but we don't know if the timer_bh is running on
@@ -236,6 +236,11 @@ static struct irqaction timer_irqaction = {
236 .name = "timer" 236 .name = "timer"
237}; 237};
238 238
239void __devinit ia64_disable_timer(void)
240{
241 ia64_set_itv(1 << 16);
242}
243
239void __init 244void __init
240time_init (void) 245time_init (void)
241{ 246{
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 6e5eea19fa67..3b6fd798c4d6 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -36,7 +36,7 @@ int arch_register_cpu(int num)
36 parent = &sysfs_nodes[cpu_to_node(num)]; 36 parent = &sysfs_nodes[cpu_to_node(num)];
37#endif /* CONFIG_NUMA */ 37#endif /* CONFIG_NUMA */
38 38
39#ifdef CONFIG_ACPI 39#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
40 /* 40 /*
41 * If CPEI cannot be re-targetted, and this is 41 * If CPEI cannot be re-targetted, and this is
42 * CPEI target, then dont create the control file 42 * CPEI target, then dont create the control file
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index acaaec4e4681..9855ba318094 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -181,13 +181,15 @@ per_cpu_init (void)
181{ 181{
182 void *cpu_data; 182 void *cpu_data;
183 int cpu; 183 int cpu;
184 static int first_time=1;
184 185
185 /* 186 /*
186 * get_free_pages() cannot be used before cpu_init() done. BSP 187 * get_free_pages() cannot be used before cpu_init() done. BSP
187 * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls 188 * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls
188 * get_zeroed_page(). 189 * get_zeroed_page().
189 */ 190 */
190 if (smp_processor_id() == 0) { 191 if (first_time) {
192 first_time=0;
191 cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, 193 cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
192 PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 194 PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
193 for (cpu = 0; cpu < NR_CPUS; cpu++) { 195 for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c87d6d1d5813..573d5cc63e2b 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -528,12 +528,17 @@ void __init find_memory(void)
528void *per_cpu_init(void) 528void *per_cpu_init(void)
529{ 529{
530 int cpu; 530 int cpu;
531 static int first_time = 1;
532
531 533
532 if (smp_processor_id() != 0) 534 if (smp_processor_id() != 0)
533 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 535 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
534 536
535 for (cpu = 0; cpu < NR_CPUS; cpu++) 537 if (first_time) {
536 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; 538 first_time = 0;
539 for (cpu = 0; cpu < NR_CPUS; cpu++)
540 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
541 }
537 542
538 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 543 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
539} 544}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 2d13889d0a99..9dbc7dadd165 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
68#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } 68#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
69 69
70/* 70/*
71 * This function checks for proper alignment of input addr and len parameters. 71 * Don't actually need to do any preparation, but need to make sure
72 * the address is in the right region.
72 */ 73 */
73int is_aligned_hugepage_range(unsigned long addr, unsigned long len) 74int prepare_hugepage_range(unsigned long addr, unsigned long len)
74{ 75{
75 if (len & ~HPAGE_MASK) 76 if (len & ~HPAGE_MASK)
76 return -EINVAL; 77 return -EINVAL;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b38b6d213c15..08d94e6bfa18 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -197,7 +197,7 @@ free_initmem (void)
197 eaddr = (unsigned long) ia64_imva(__init_end); 197 eaddr = (unsigned long) ia64_imva(__init_end);
198 while (addr < eaddr) { 198 while (addr < eaddr) {
199 ClearPageReserved(virt_to_page(addr)); 199 ClearPageReserved(virt_to_page(addr));
200 set_page_count(virt_to_page(addr), 1); 200 init_page_count(virt_to_page(addr));
201 free_page(addr); 201 free_page(addr);
202 ++totalram_pages; 202 ++totalram_pages;
203 addr += PAGE_SIZE; 203 addr += PAGE_SIZE;
@@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end)
252 continue; 252 continue;
253 page = virt_to_page(start); 253 page = virt_to_page(start);
254 ClearPageReserved(page); 254 ClearPageReserved(page);
255 set_page_count(page, 1); 255 init_page_count(page);
256 free_page(start); 256 free_page(start);
257 ++totalram_pages; 257 ++totalram_pages;
258 } 258 }
@@ -640,7 +640,7 @@ mem_init (void)
640void online_page(struct page *page) 640void online_page(struct page *page)
641{ 641{
642 ClearPageReserved(page); 642 ClearPageReserved(page);
643 set_page_count(page, 1); 643 init_page_count(page);
644 __free_page(page); 644 __free_page(page);
645 totalram_pages++; 645 totalram_pages++;
646 num_physpages++; 646 num_physpages++;
diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile
index 3e9b4eea7418..ab9c48c88012 100644
--- a/arch/ia64/sn/kernel/Makefile
+++ b/arch/ia64/sn/kernel/Makefile
@@ -10,7 +10,8 @@
10CPPFLAGS += -I$(srctree)/arch/ia64/sn/include 10CPPFLAGS += -I$(srctree)/arch/ia64/sn/include
11 11
12obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \ 12obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \
13 huberror.o io_init.o iomv.o klconflib.o sn2/ 13 huberror.o io_init.o iomv.o klconflib.o pio_phys.o \
14 sn2/
14obj-$(CONFIG_IA64_GENERIC) += machvec.o 15obj-$(CONFIG_IA64_GENERIC) += machvec.o
15obj-$(CONFIG_SGI_TIOCX) += tiocx.o 16obj-$(CONFIG_SGI_TIOCX) += tiocx.o
16obj-$(CONFIG_IA64_SGI_SN_XP) += xp.o 17obj-$(CONFIG_IA64_SGI_SN_XP) += xp.o
diff --git a/arch/ia64/sn/kernel/pio_phys.S b/arch/ia64/sn/kernel/pio_phys.S
new file mode 100644
index 000000000000..3c7d48d6ecb8
--- /dev/null
+++ b/arch/ia64/sn/kernel/pio_phys.S
@@ -0,0 +1,71 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
7 *
8 * This file contains macros used to access MMR registers via
9 * uncached physical addresses.
10 * pio_phys_read_mmr - read an MMR
11 * pio_phys_write_mmr - write an MMR
12 * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0
13 * Second MMR will be skipped if address is NULL
14 *
15 * Addresses passed to these routines should be uncached physical addresses
16 * ie., 0x80000....
17 */
18
19
20
21#include <asm/asmmacro.h>
22#include <asm/page.h>
23
24GLOBAL_ENTRY(pio_phys_read_mmr)
25 .prologue
26 .regstk 1,0,0,0
27 .body
28 mov r2=psr
29 rsm psr.i | psr.dt
30 ;;
31 srlz.d
32 ld8.acq r8=[r32]
33 ;;
34 mov psr.l=r2;;
35 srlz.d
36 br.ret.sptk.many rp
37END(pio_phys_read_mmr)
38
39GLOBAL_ENTRY(pio_phys_write_mmr)
40 .prologue
41 .regstk 2,0,0,0
42 .body
43 mov r2=psr
44 rsm psr.i | psr.dt
45 ;;
46 srlz.d
47 st8.rel [r32]=r33
48 ;;
49 mov psr.l=r2;;
50 srlz.d
51 br.ret.sptk.many rp
52END(pio_phys_write_mmr)
53
54GLOBAL_ENTRY(pio_atomic_phys_write_mmrs)
55 .prologue
56 .regstk 4,0,0,0
57 .body
58 mov r2=psr
59 cmp.ne p9,p0=r34,r0;
60 rsm psr.i | psr.dt | psr.ic
61 ;;
62 srlz.d
63 st8.rel [r32]=r33
64(p9) st8.rel [r34]=r35
65 ;;
66 mov psr.l=r2;;
67 srlz.d
68 br.ret.sptk.many rp
69END(pio_atomic_phys_write_mmrs)
70
71
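The three entry points above are meant to be called from C; their declarations live in an SN header that is not part of this diff, so the prototypes below are an assumption inferred from the register usage (in0 = MMR address, in1 = value, in2/in3 = optional second MMR and value):

	extern long pio_phys_read_mmr(volatile long *mmr);
	extern void pio_phys_write_mmr(volatile long *mmr, long val);
	extern void pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1,
					       volatile long *mmr2, long val2);

	/* callers pass uncached physical addresses (0x8000...); a NULL second
	 * MMR makes pio_atomic_phys_write_mmrs skip the second store */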
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 5b84836c2171..8b6d5c844708 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -3,7 +3,7 @@
3 * License. See the file "COPYING" in the main directory of this archive 3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details. 4 * for more details.
5 * 5 *
6 * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved. 6 * Copyright (C) 1999,2001-2006 Silicon Graphics, Inc. All rights reserved.
7 */ 7 */
8 8
9#include <linux/config.h> 9#include <linux/config.h>
@@ -498,6 +498,7 @@ void __init sn_setup(char **cmdline_p)
498 * for sn. 498 * for sn.
499 */ 499 */
500 pm_power_off = ia64_sn_power_down; 500 pm_power_off = ia64_sn_power_down;
501 current->thread.flags |= IA64_THREAD_MIGRATION;
501} 502}
502 503
503/** 504/**
@@ -660,7 +661,8 @@ void __init sn_cpu_init(void)
660 SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3}; 661 SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3};
661 u64 *pio; 662 u64 *pio;
662 pio = is_shub1() ? pio1 : pio2; 663 pio = is_shub1() ? pio1 : pio2;
663 pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]); 664 pda->pio_write_status_addr =
665 (volatile unsigned long *)GLOBAL_MMR_ADDR(nasid, pio[slice]);
664 pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0; 666 pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0;
665 } 667 }
666 668
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index b2e1e746b47f..d9d306c79f2d 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -93,6 +93,27 @@ static inline unsigned long wait_piowc(void)
93 return (ws & SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK) != 0; 93 return (ws & SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK) != 0;
94} 94}
95 95
96/**
97 * sn_migrate - SN-specific task migration actions
98 * @task: Task being migrated to new CPU
99 *
100 * SN2 PIO writes from separate CPUs are not guaranteed to arrive in order.
101 * Context switching user threads which have memory-mapped MMIO may cause
102 * PIOs to issue from separate CPUs, thus the PIO writes must be drained
103 * from the previous CPU's Shub before execution resumes on the new CPU.
104 */
105void sn_migrate(struct task_struct *task)
106{
107 pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu);
108 volatile unsigned long *adr = last_pda->pio_write_status_addr;
109 unsigned long val = last_pda->pio_write_status_val;
110
111 /* Drain PIO writes from old CPU's Shub */
112 while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK)
113 != val))
114 cpu_relax();
115}
116
96void sn_tlb_migrate_finish(struct mm_struct *mm) 117void sn_tlb_migrate_finish(struct mm_struct *mm)
97{ 118{
98 /* flush_tlb_mm is inefficient if more than 1 users of mm */ 119 /* flush_tlb_mm is inefficient if more than 1 users of mm */
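sn_migrate() above simply spins until the previous CPU's Shub reports no pending PIO writes. Its call site is not in this hunk; given the IA64_THREAD_MIGRATION flag set in the setup.c hunk earlier, a plausible (purely hypothetical) hook on the scheduling path would look like:

	/* hypothetical sketch -- the real hook sits in the ia64 context-switch path */
	static inline void example_migration_hook(struct task_struct *next)
	{
		if ((next->thread.flags & IA64_THREAD_MIGRATION) &&
		    task_thread_info(next)->last_cpu != smp_processor_id())
			sn_migrate(next);	/* drain PIOs on the previous CPU's Shub */
	}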
diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c
index cdf6856ce089..d0abddd9ffe6 100644
--- a/arch/ia64/sn/kernel/xpc_channel.c
+++ b/arch/ia64/sn/kernel/xpc_channel.c
@@ -21,7 +21,6 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/cache.h> 22#include <linux/cache.h>
23#include <linux/interrupt.h> 23#include <linux/interrupt.h>
24#include <linux/slab.h>
25#include <linux/mutex.h> 24#include <linux/mutex.h>
26#include <linux/completion.h> 25#include <linux/completion.h>
27#include <asm/sn/bte.h> 26#include <asm/sn/bte.h>
@@ -30,6 +29,31 @@
30 29
31 30
32/* 31/*
32 * Guarantee that the kzalloc'd memory is cacheline aligned.
33 */
34static void *
35xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
36{
37 /* see if kzalloc will give us cacheline aligned memory by default */
38 *base = kzalloc(size, flags);
39 if (*base == NULL) {
40 return NULL;
41 }
42 if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
43 return *base;
44 }
45 kfree(*base);
46
47 /* nope, we'll have to do it ourselves */
48 *base = kzalloc(size + L1_CACHE_BYTES, flags);
49 if (*base == NULL) {
50 return NULL;
51 }
52 return (void *) L1_CACHE_ALIGN((u64) *base);
53}
54
55
56/*
33 * Set up the initial values for the XPartition Communication channels. 57 * Set up the initial values for the XPartition Communication channels.
34 */ 58 */
35static void 59static void
@@ -93,20 +117,19 @@ xpc_setup_infrastructure(struct xpc_partition *part)
93 * Allocate all of the channel structures as a contiguous chunk of 117 * Allocate all of the channel structures as a contiguous chunk of
94 * memory. 118 * memory.
95 */ 119 */
96 part->channels = kmalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, 120 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
97 GFP_KERNEL); 121 GFP_KERNEL);
98 if (part->channels == NULL) { 122 if (part->channels == NULL) {
99 dev_err(xpc_chan, "can't get memory for channels\n"); 123 dev_err(xpc_chan, "can't get memory for channels\n");
100 return xpcNoMemory; 124 return xpcNoMemory;
101 } 125 }
102 memset(part->channels, 0, sizeof(struct xpc_channel) * XPC_NCHANNELS);
103 126
104 part->nchannels = XPC_NCHANNELS; 127 part->nchannels = XPC_NCHANNELS;
105 128
106 129
107 /* allocate all the required GET/PUT values */ 130 /* allocate all the required GET/PUT values */
108 131
109 part->local_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, 132 part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
110 GFP_KERNEL, &part->local_GPs_base); 133 GFP_KERNEL, &part->local_GPs_base);
111 if (part->local_GPs == NULL) { 134 if (part->local_GPs == NULL) {
112 kfree(part->channels); 135 kfree(part->channels);
@@ -115,55 +138,51 @@ xpc_setup_infrastructure(struct xpc_partition *part)
115 "values\n"); 138 "values\n");
116 return xpcNoMemory; 139 return xpcNoMemory;
117 } 140 }
118 memset(part->local_GPs, 0, XPC_GP_SIZE);
119 141
120 part->remote_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, 142 part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
121 GFP_KERNEL, &part->remote_GPs_base); 143 GFP_KERNEL, &part->remote_GPs_base);
122 if (part->remote_GPs == NULL) { 144 if (part->remote_GPs == NULL) {
123 kfree(part->channels);
124 part->channels = NULL;
125 kfree(part->local_GPs_base);
126 part->local_GPs = NULL;
127 dev_err(xpc_chan, "can't get memory for remote get/put " 145 dev_err(xpc_chan, "can't get memory for remote get/put "
128 "values\n"); 146 "values\n");
147 kfree(part->local_GPs_base);
148 part->local_GPs = NULL;
149 kfree(part->channels);
150 part->channels = NULL;
129 return xpcNoMemory; 151 return xpcNoMemory;
130 } 152 }
131 memset(part->remote_GPs, 0, XPC_GP_SIZE);
132 153
133 154
134 /* allocate all the required open and close args */ 155 /* allocate all the required open and close args */
135 156
136 part->local_openclose_args = xpc_kmalloc_cacheline_aligned( 157 part->local_openclose_args = xpc_kzalloc_cacheline_aligned(
137 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, 158 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
138 &part->local_openclose_args_base); 159 &part->local_openclose_args_base);
139 if (part->local_openclose_args == NULL) { 160 if (part->local_openclose_args == NULL) {
140 kfree(part->channels); 161 dev_err(xpc_chan, "can't get memory for local connect args\n");
141 part->channels = NULL;
142 kfree(part->local_GPs_base);
143 part->local_GPs = NULL;
144 kfree(part->remote_GPs_base); 162 kfree(part->remote_GPs_base);
145 part->remote_GPs = NULL; 163 part->remote_GPs = NULL;
146 dev_err(xpc_chan, "can't get memory for local connect args\n"); 164 kfree(part->local_GPs_base);
165 part->local_GPs = NULL;
166 kfree(part->channels);
167 part->channels = NULL;
147 return xpcNoMemory; 168 return xpcNoMemory;
148 } 169 }
149 memset(part->local_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE);
150 170
151 part->remote_openclose_args = xpc_kmalloc_cacheline_aligned( 171 part->remote_openclose_args = xpc_kzalloc_cacheline_aligned(
152 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, 172 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
153 &part->remote_openclose_args_base); 173 &part->remote_openclose_args_base);
154 if (part->remote_openclose_args == NULL) { 174 if (part->remote_openclose_args == NULL) {
155 kfree(part->channels); 175 dev_err(xpc_chan, "can't get memory for remote connect args\n");
156 part->channels = NULL;
157 kfree(part->local_GPs_base);
158 part->local_GPs = NULL;
159 kfree(part->remote_GPs_base);
160 part->remote_GPs = NULL;
161 kfree(part->local_openclose_args_base); 176 kfree(part->local_openclose_args_base);
162 part->local_openclose_args = NULL; 177 part->local_openclose_args = NULL;
163 dev_err(xpc_chan, "can't get memory for remote connect args\n"); 178 kfree(part->remote_GPs_base);
179 part->remote_GPs = NULL;
180 kfree(part->local_GPs_base);
181 part->local_GPs = NULL;
182 kfree(part->channels);
183 part->channels = NULL;
164 return xpcNoMemory; 184 return xpcNoMemory;
165 } 185 }
166 memset(part->remote_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE);
167 186
168 187
169 xpc_initialize_channels(part, partid); 188 xpc_initialize_channels(part, partid);
@@ -186,18 +205,18 @@ xpc_setup_infrastructure(struct xpc_partition *part)
186 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ, 205 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ,
187 part->IPI_owner, (void *) (u64) partid); 206 part->IPI_owner, (void *) (u64) partid);
188 if (ret != 0) { 207 if (ret != 0) {
189 kfree(part->channels);
190 part->channels = NULL;
191 kfree(part->local_GPs_base);
192 part->local_GPs = NULL;
193 kfree(part->remote_GPs_base);
194 part->remote_GPs = NULL;
195 kfree(part->local_openclose_args_base);
196 part->local_openclose_args = NULL;
197 kfree(part->remote_openclose_args_base);
198 part->remote_openclose_args = NULL;
199 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, " 208 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
200 "errno=%d\n", -ret); 209 "errno=%d\n", -ret);
210 kfree(part->remote_openclose_args_base);
211 part->remote_openclose_args = NULL;
212 kfree(part->local_openclose_args_base);
213 part->local_openclose_args = NULL;
214 kfree(part->remote_GPs_base);
215 part->remote_GPs = NULL;
216 kfree(part->local_GPs_base);
217 part->local_GPs = NULL;
218 kfree(part->channels);
219 part->channels = NULL;
201 return xpcLackOfResources; 220 return xpcLackOfResources;
202 } 221 }
203 222
@@ -446,22 +465,20 @@ xpc_allocate_local_msgqueue(struct xpc_channel *ch)
446 for (nentries = ch->local_nentries; nentries > 0; nentries--) { 465 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
447 466
448 nbytes = nentries * ch->msg_size; 467 nbytes = nentries * ch->msg_size;
449 ch->local_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, 468 ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
450 GFP_KERNEL, 469 GFP_KERNEL,
451 &ch->local_msgqueue_base); 470 &ch->local_msgqueue_base);
452 if (ch->local_msgqueue == NULL) { 471 if (ch->local_msgqueue == NULL) {
453 continue; 472 continue;
454 } 473 }
455 memset(ch->local_msgqueue, 0, nbytes);
456 474
457 nbytes = nentries * sizeof(struct xpc_notify); 475 nbytes = nentries * sizeof(struct xpc_notify);
458 ch->notify_queue = kmalloc(nbytes, GFP_KERNEL); 476 ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
459 if (ch->notify_queue == NULL) { 477 if (ch->notify_queue == NULL) {
460 kfree(ch->local_msgqueue_base); 478 kfree(ch->local_msgqueue_base);
461 ch->local_msgqueue = NULL; 479 ch->local_msgqueue = NULL;
462 continue; 480 continue;
463 } 481 }
464 memset(ch->notify_queue, 0, nbytes);
465 482
466 spin_lock_irqsave(&ch->lock, irq_flags); 483 spin_lock_irqsave(&ch->lock, irq_flags);
467 if (nentries < ch->local_nentries) { 484 if (nentries < ch->local_nentries) {
@@ -501,13 +518,12 @@ xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
501 for (nentries = ch->remote_nentries; nentries > 0; nentries--) { 518 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
502 519
503 nbytes = nentries * ch->msg_size; 520 nbytes = nentries * ch->msg_size;
504 ch->remote_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, 521 ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
505 GFP_KERNEL, 522 GFP_KERNEL,
506 &ch->remote_msgqueue_base); 523 &ch->remote_msgqueue_base);
507 if (ch->remote_msgqueue == NULL) { 524 if (ch->remote_msgqueue == NULL) {
508 continue; 525 continue;
509 } 526 }
510 memset(ch->remote_msgqueue, 0, nbytes);
511 527
512 spin_lock_irqsave(&ch->lock, irq_flags); 528 spin_lock_irqsave(&ch->lock, irq_flags);
513 if (nentries < ch->remote_nentries) { 529 if (nentries < ch->remote_nentries) {
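A usage note on the xpc_kzalloc_cacheline_aligned() helper introduced at the top of this file: the caller keeps both the aligned pointer it uses and the *base pointer, and only the base pointer may ever be passed to kfree(); that is why every error path above frees the *_base fields. A short hedged sketch:

	void *gp_base;	/* pointer kfree() must receive */
	void *gp;	/* cacheline-aligned pointer actually used */

	gp = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &gp_base);
	if (gp == NULL)
		return xpcNoMemory;

	/* ... use gp ... */

	kfree(gp_base);		/* never kfree(gp): it may point past the real allocation start */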
diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c
index 8cbf16432570..99b123a6421a 100644
--- a/arch/ia64/sn/kernel/xpc_main.c
+++ b/arch/ia64/sn/kernel/xpc_main.c
@@ -52,7 +52,6 @@
52#include <linux/syscalls.h> 52#include <linux/syscalls.h>
53#include <linux/cache.h> 53#include <linux/cache.h>
54#include <linux/interrupt.h> 54#include <linux/interrupt.h>
55#include <linux/slab.h>
56#include <linux/delay.h> 55#include <linux/delay.h>
57#include <linux/reboot.h> 56#include <linux/reboot.h>
58#include <linux/completion.h> 57#include <linux/completion.h>
diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c
index 88a730e6cfdb..94211429fd0c 100644
--- a/arch/ia64/sn/kernel/xpc_partition.c
+++ b/arch/ia64/sn/kernel/xpc_partition.c
@@ -81,6 +81,31 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE +
81 81
82 82
83/* 83/*
84 * Guarantee that the kmalloc'd memory is cacheline aligned.
85 */
86static void *
87xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
88{
89 /* see if kmalloc will give us cacheline aligned memory by default */
90 *base = kmalloc(size, flags);
91 if (*base == NULL) {
92 return NULL;
93 }
94 if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
95 return *base;
96 }
97 kfree(*base);
98
99 /* nope, we'll have to do it ourselves */
100 *base = kmalloc(size + L1_CACHE_BYTES, flags);
101 if (*base == NULL) {
102 return NULL;
103 }
104 return (void *) L1_CACHE_ALIGN((u64) *base);
105}
106
107
108/*
84 * Given a nasid, get the physical address of the partition's reserved page 109 * Given a nasid, get the physical address of the partition's reserved page
85 * for that nasid. This function returns 0 on any error. 110 * for that nasid. This function returns 0 on any error.
86 */ 111 */
@@ -1038,13 +1063,12 @@ xpc_discovery(void)
1038 remote_vars = (struct xpc_vars *) remote_rp; 1063 remote_vars = (struct xpc_vars *) remote_rp;
1039 1064
1040 1065
1041 discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words, 1066 discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
1042 GFP_KERNEL); 1067 GFP_KERNEL);
1043 if (discovered_nasids == NULL) { 1068 if (discovered_nasids == NULL) {
1044 kfree(remote_rp_base); 1069 kfree(remote_rp_base);
1045 return; 1070 return;
1046 } 1071 }
1047 memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words);
1048 1072
1049 rp = (struct xpc_rsvd_page *) xpc_rsvd_page; 1073 rp = (struct xpc_rsvd_page *) xpc_rsvd_page;
1050 1074
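The two hunks above are the usual kmalloc()+memset() to kzalloc() conversion; the forms are equivalent, kzalloc() simply zeroes the buffer for the caller:

	/* before */
	p = kmalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL);
	if (p != NULL)
		memset(p, 0, sizeof(u64) * xp_nasid_mask_words);

	/* after */
	p = kzalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL);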
diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
index e52831ed93eb..fa073cc4b565 100644
--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -15,6 +15,124 @@
15#include <asm/sn/pcidev.h> 15#include <asm/sn/pcidev.h>
16#include <asm/sn/pcibus_provider_defs.h> 16#include <asm/sn/pcibus_provider_defs.h>
17#include <asm/sn/tioce_provider.h> 17#include <asm/sn/tioce_provider.h>
18#include <asm/sn/sn2/sn_hwperf.h>
19
20/*
21 * 1/26/2006
22 *
23 * WAR for SGI PV 944642. For revA TIOCE, need to use the following recipe
24 * (taken from the above PV) before and after accessing tioce internal MMR's
25 * to avoid tioce lockups.
26 *
27 * The recipe as taken from the PV:
28 *
29 * if(mmr address < 0x45000) {
30 * if(mmr address == 0 or 0x80)
31 * mmr wrt or read address 0xc0
32 * else if(mmr address == 0x148 or 0x200)
33 * mmr wrt or read address 0x28
34 * else
35 * mmr wrt or read address 0x158
36 *
37 * do desired mmr access (rd or wrt)
38 *
39 * if(mmr address == 0x100)
40 * mmr wrt or read address 0x38
41 * mmr wrt or read address 0xb050
42 * } else
43 * do desired mmr access
44 *
45 * According to hw, we can use reads instead of writes to the above addresses
46 *
47 * Note this WAR can only be used for accessing internal MMRs in the
48 * TIOCE Coretalk Address Range 0x0 - 0x07ff_ffff. This includes the
49 * "Local CE Registers and Memories" and "PCI Compatible Config Space" address
50 * spaces from table 2-1 of the "CE Programmer's Reference Overview" document.
51 *
52 * All registers defined in struct tioce will meet that criteria.
53 */
54
55static void inline
56tioce_mmr_war_pre(struct tioce_kernel *kern, void *mmr_addr)
57{
58 u64 mmr_base;
59 u64 mmr_offset;
60
61 if (kern->ce_common->ce_rev != TIOCE_REV_A)
62 return;
63
64 mmr_base = kern->ce_common->ce_pcibus.bs_base;
65 mmr_offset = (u64)mmr_addr - mmr_base;
66
67 if (mmr_offset < 0x45000) {
68 u64 mmr_war_offset;
69
70 if (mmr_offset == 0 || mmr_offset == 0x80)
71 mmr_war_offset = 0xc0;
72 else if (mmr_offset == 0x148 || mmr_offset == 0x200)
73 mmr_war_offset = 0x28;
74 else
75 mmr_war_offset = 0x158;
76
77 readq_relaxed((void *)(mmr_base + mmr_war_offset));
78 }
79}
80
81static void inline
82tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr)
83{
84 u64 mmr_base;
85 u64 mmr_offset;
86
87 if (kern->ce_common->ce_rev != TIOCE_REV_A)
88 return;
89
90 mmr_base = kern->ce_common->ce_pcibus.bs_base;
91 mmr_offset = (u64)mmr_addr - mmr_base;
92
93 if (mmr_offset < 0x45000) {
94 if (mmr_offset == 0x100)
95 readq_relaxed((void *)(mmr_base + 0x38));
96 readq_relaxed((void *)(mmr_base + 0xb050));
97 }
98}
99
100/* load mmr contents into a variable */
101#define tioce_mmr_load(kern, mmrp, varp) do {\
102 tioce_mmr_war_pre(kern, mmrp); \
103 *(varp) = readq_relaxed(mmrp); \
104 tioce_mmr_war_post(kern, mmrp); \
105} while (0)
106
107/* store variable contents into mmr */
108#define tioce_mmr_store(kern, mmrp, varp) do {\
109 tioce_mmr_war_pre(kern, mmrp); \
110 writeq(*varp, mmrp); \
111 tioce_mmr_war_post(kern, mmrp); \
112} while (0)
113
114/* store immediate value into mmr */
115#define tioce_mmr_storei(kern, mmrp, val) do {\
116 tioce_mmr_war_pre(kern, mmrp); \
117 writeq(val, mmrp); \
118 tioce_mmr_war_post(kern, mmrp); \
119} while (0)
120
121/* set bits (immediate value) into mmr */
122#define tioce_mmr_seti(kern, mmrp, bits) do {\
123 u64 tmp; \
124 tioce_mmr_load(kern, mmrp, &tmp); \
125 tmp |= (bits); \
126 tioce_mmr_store(kern, mmrp, &tmp); \
127} while (0)
128
129/* clear bits (immediate value) into mmr */
130#define tioce_mmr_clri(kern, mmrp, bits) do { \
131 u64 tmp; \
132 tioce_mmr_load(kern, mmrp, &tmp); \
133 tmp &= ~(bits); \
134 tioce_mmr_store(kern, mmrp, &tmp); \
135} while (0)
18 136
19/** 137/**
20 * Bus address ranges for the 5 flavors of TIOCE DMA 138 * Bus address ranges for the 5 flavors of TIOCE DMA
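The wrappers above apply the rev-A pre/post accesses from the PV recipe around every TIOCE MMR access; the rest of this file converts raw readq/writeq calls to them. A representative read-modify-write through the wrappers, mirroring the interrupt-targeting hunk later in this file:

	/* sketch: retarget an interrupt with the mask bit raised around the update */
	tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, 1UL << bit);
	tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector);
	tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, 1UL << bit);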
@@ -62,9 +180,9 @@
62#define TIOCE_ATE_M40 2 180#define TIOCE_ATE_M40 2
63#define TIOCE_ATE_M40S 3 181#define TIOCE_ATE_M40S 3
64 182
65#define KB(x) ((x) << 10) 183#define KB(x) ((u64)(x) << 10)
66#define MB(x) ((x) << 20) 184#define MB(x) ((u64)(x) << 20)
67#define GB(x) ((x) << 30) 185#define GB(x) ((u64)(x) << 30)
68 186
69/** 187/**
70 * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode 188 * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode
@@ -151,7 +269,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
151 int last; 269 int last;
152 int entries; 270 int entries;
153 int nates; 271 int nates;
154 int pagesize; 272 u64 pagesize;
155 u64 *ate_shadow; 273 u64 *ate_shadow;
156 u64 *ate_reg; 274 u64 *ate_reg;
157 u64 addr; 275 u64 addr;
@@ -228,7 +346,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
228 346
229 ate = ATE_MAKE(addr, pagesize); 347 ate = ATE_MAKE(addr, pagesize);
230 ate_shadow[i + j] = ate; 348 ate_shadow[i + j] = ate;
231 writeq(ate, &ate_reg[i + j]); 349 tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate);
232 addr += pagesize; 350 addr += pagesize;
233 } 351 }
234 352
@@ -272,7 +390,8 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr)
272 u64 tmp; 390 u64 tmp;
273 391
274 ce_kern->ce_port[port].dirmap_shadow = ct_upper; 392 ce_kern->ce_port[port].dirmap_shadow = ct_upper;
275 writeq(ct_upper, &ce_mmr->ce_ure_dir_map[port]); 393 tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port],
394 ct_upper);
276 tmp = ce_mmr->ce_ure_dir_map[port]; 395 tmp = ce_mmr->ce_ure_dir_map[port];
277 dma_ok = 1; 396 dma_ok = 1;
278 } else 397 } else
@@ -344,7 +463,8 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
344 if (TIOCE_D32_ADDR(bus_addr)) { 463 if (TIOCE_D32_ADDR(bus_addr)) {
345 if (--ce_kern->ce_port[port].dirmap_refcnt == 0) { 464 if (--ce_kern->ce_port[port].dirmap_refcnt == 0) {
346 ce_kern->ce_port[port].dirmap_shadow = 0; 465 ce_kern->ce_port[port].dirmap_shadow = 0;
347 writeq(0, &ce_mmr->ce_ure_dir_map[port]); 466 tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port],
467 0);
348 } 468 }
349 } else { 469 } else {
350 struct tioce_dmamap *map; 470 struct tioce_dmamap *map;
@@ -365,7 +485,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
365 } else if (--map->refcnt == 0) { 485 } else if (--map->refcnt == 0) {
366 for (i = 0; i < map->ate_count; i++) { 486 for (i = 0; i < map->ate_count; i++) {
367 map->ate_shadow[i] = 0; 487 map->ate_shadow[i] = 0;
368 map->ate_hw[i] = 0; 488 tioce_mmr_storei(ce_kern, &map->ate_hw[i], 0);
369 } 489 }
370 490
371 list_del(&map->ce_dmamap_list); 491 list_del(&map->ce_dmamap_list);
@@ -486,7 +606,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
486 spin_unlock_irqrestore(&ce_kern->ce_lock, flags); 606 spin_unlock_irqrestore(&ce_kern->ce_lock, flags);
487 607
488dma_map_done: 608dma_map_done:
489 if (mapaddr & barrier) 609 if (mapaddr && barrier)
490 mapaddr = tioce_dma_barrier(mapaddr, 1); 610 mapaddr = tioce_dma_barrier(mapaddr, 1);
491 611
492 return mapaddr; 612 return mapaddr;
@@ -541,17 +661,61 @@ tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt)
541 soft->ce_pcibus.bs_persist_segment, 661 soft->ce_pcibus.bs_persist_segment,
542 soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0); 662 soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0);
543 663
664 if (ret_stuff.v0)
665 panic("tioce_error_intr_handler: Fatal TIOCE error");
666
544 return IRQ_HANDLED; 667 return IRQ_HANDLED;
545} 668}
546 669
547/** 670/**
671 * tioce_reserve_m32 - reserve M32 ate's for the indicated address range
672 * @tioce_kernel: TIOCE context to reserve ate's for
673 * @base: starting bus address to reserve
674 * @limit: last bus address to reserve
675 *
676 * If base/limit falls within the range of bus space mapped through the
677 * M32 space, reserve the resources corresponding to the range.
678 */
679static void
680tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit)
681{
682 int ate_index, last_ate, ps;
683 struct tioce *ce_mmr;
684
685 if (!TIOCE_M32_ADDR(base))
686 return;
687
688 ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base;
689 ps = ce_kern->ce_ate3240_pagesize;
690 ate_index = ATE_PAGE(base, ps);
691 last_ate = ate_index + ATE_NPAGES(base, limit-base+1, ps) - 1;
692
693 if (ate_index < 64)
694 ate_index = 64;
695
696 while (ate_index <= last_ate) {
697 u64 ate;
698
699 ate = ATE_MAKE(0xdeadbeef, ps);
700 ce_kern->ce_ate3240_shadow[ate_index] = ate;
701 tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index],
702 ate);
703 ate_index++;
704 }
705}
706
707/**
548 * tioce_kern_init - init kernel structures related to a given TIOCE 708 * tioce_kern_init - init kernel structures related to a given TIOCE
549 * @tioce_common: ptr to a cached tioce_common struct that originated in prom 709 * @tioce_common: ptr to a cached tioce_common struct that originated in prom
550 */ static struct tioce_kernel * 710 */
711static struct tioce_kernel *
551tioce_kern_init(struct tioce_common *tioce_common) 712tioce_kern_init(struct tioce_common *tioce_common)
552{ 713{
553 int i; 714 int i;
715 int ps;
716 int dev;
554 u32 tmp; 717 u32 tmp;
718 unsigned int seg, bus;
555 struct tioce *tioce_mmr; 719 struct tioce *tioce_mmr;
556 struct tioce_kernel *tioce_kern; 720 struct tioce_kernel *tioce_kern;
557 721
@@ -572,9 +736,10 @@ tioce_kern_init(struct tioce_common *tioce_common)
572 * here to use pci_read_config_xxx() so use the raw_pci_ops vector. 736 * here to use pci_read_config_xxx() so use the raw_pci_ops vector.
573 */ 737 */
574 738
575 raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment, 739 seg = tioce_common->ce_pcibus.bs_persist_segment;
576 tioce_common->ce_pcibus.bs_persist_busnum, 740 bus = tioce_common->ce_pcibus.bs_persist_busnum;
577 PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp); 741
742 raw_pci_ops->read(seg, bus, PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1,&tmp);
578 tioce_kern->ce_port1_secondary = (u8) tmp; 743 tioce_kern->ce_port1_secondary = (u8) tmp;
579 744
580 /* 745 /*
@@ -583,18 +748,76 @@ tioce_kern_init(struct tioce_common *tioce_common)
583 */ 748 */
584 749
585 tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; 750 tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base;
586 __sn_clrq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_PAGESIZE_MASK); 751 tioce_mmr_clri(tioce_kern, &tioce_mmr->ce_ure_page_map,
587 __sn_setq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_256K_PAGESIZE); 752 CE_URE_PAGESIZE_MASK);
588 tioce_kern->ce_ate3240_pagesize = KB(256); 753 tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_ure_page_map,
754 CE_URE_256K_PAGESIZE);
755 ps = tioce_kern->ce_ate3240_pagesize = KB(256);
589 756
590 for (i = 0; i < TIOCE_NUM_M40_ATES; i++) { 757 for (i = 0; i < TIOCE_NUM_M40_ATES; i++) {
591 tioce_kern->ce_ate40_shadow[i] = 0; 758 tioce_kern->ce_ate40_shadow[i] = 0;
592 writeq(0, &tioce_mmr->ce_ure_ate40[i]); 759 tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate40[i], 0);
593 } 760 }
594 761
595 for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) { 762 for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) {
596 tioce_kern->ce_ate3240_shadow[i] = 0; 763 tioce_kern->ce_ate3240_shadow[i] = 0;
597 writeq(0, &tioce_mmr->ce_ure_ate3240[i]); 764 tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate3240[i], 0);
765 }
766
767 /*
768 * Reserve ATE's corresponding to reserved address ranges. These
769 * include:
770 *
771 * Memory space covered by each PPB mem base/limit register
772 * Memory space covered by each PPB prefetch base/limit register
773 *
774 * These bus ranges are for pio (downstream) traffic only, and so
775 * cannot be used for DMA.
776 */
777
778 for (dev = 1; dev <= 2; dev++) {
779 u64 base, limit;
780
781 /* mem base/limit */
782
783 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
784 PCI_MEMORY_BASE, 2, &tmp);
785 base = (u64)tmp << 16;
786
787 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
788 PCI_MEMORY_LIMIT, 2, &tmp);
789 limit = (u64)tmp << 16;
790 limit |= 0xfffffUL;
791
792 if (base < limit)
793 tioce_reserve_m32(tioce_kern, base, limit);
794
795 /*
796 * prefetch mem base/limit. The tioce ppb's have 64-bit
797 * decoders, so read the upper portions w/o checking the
798 * attributes.
799 */
800
801 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
802 PCI_PREF_MEMORY_BASE, 2, &tmp);
803 base = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16;
804
805 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
806 PCI_PREF_BASE_UPPER32, 4, &tmp);
807 base |= (u64)tmp << 32;
808
809 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
810 PCI_PREF_MEMORY_LIMIT, 2, &tmp);
811
812 limit = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16;
813 limit |= 0xfffffUL;
814
815 raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
816 PCI_PREF_LIMIT_UPPER32, 4, &tmp);
817 limit |= (u64)tmp << 32;
818
819 if ((base < limit) && TIOCE_M32_ADDR(base))
820 tioce_reserve_m32(tioce_kern, base, limit);
598 } 821 }
599 822
600 return tioce_kern; 823 return tioce_kern;
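The loop above rebuilds each PPB's memory and 64-bit prefetchable windows from config space before reserving the matching M32 ATEs. The prefetch reconstruction, factored into a standalone sketch (the helper name is illustrative; the mask comes from the standard PCI headers):

	static void example_pref_window(u32 base16, u32 base_hi,
					u32 limit16, u32 limit_hi,
					u64 *base, u64 *limit)
	{
		*base  = ((u64)base16 & PCI_PREF_RANGE_MASK) << 16;
		*base |= (u64)base_hi << 32;

		*limit  = ((u64)limit16 & PCI_PREF_RANGE_MASK) << 16;
		*limit |= 0xfffffUL;		/* low 20 limit bits read as ones */
		*limit |= (u64)limit_hi << 32;
	}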
@@ -614,6 +837,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
614{ 837{
615 struct pcidev_info *pcidev_info; 838 struct pcidev_info *pcidev_info;
616 struct tioce_common *ce_common; 839 struct tioce_common *ce_common;
840 struct tioce_kernel *ce_kern;
617 struct tioce *ce_mmr; 841 struct tioce *ce_mmr;
618 u64 force_int_val; 842 u64 force_int_val;
619 843
@@ -629,6 +853,29 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
629 853
630 ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; 854 ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
631 ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; 855 ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
856 ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private;
857
858 /*
859 * TIOCE Rev A workaround (PV 945826), force an interrupt by writing
860 * the TIO_INTx register directly (1/26/2006)
861 */
862 if (ce_common->ce_rev == TIOCE_REV_A) {
863 u64 int_bit_mask = (1ULL << sn_irq_info->irq_int_bit);
864 u64 status;
865
866 tioce_mmr_load(ce_kern, &ce_mmr->ce_adm_int_status, &status);
867 if (status & int_bit_mask) {
868 u64 force_irq = (1 << 8) | sn_irq_info->irq_irq;
869 u64 ctalk = sn_irq_info->irq_xtalkaddr;
870 u64 nasid, offset;
871
872 nasid = (ctalk & CTALK_NASID_MASK) >> CTALK_NASID_SHFT;
873 offset = (ctalk & CTALK_NODE_OFFSET);
874 HUB_S(TIO_IOSPACE_ADDR(nasid, offset), force_irq);
875 }
876
877 return;
878 }
632 879
633 /* 880 /*
634 * irq_int_bit is originally set up by prom, and holds the interrupt 881 * irq_int_bit is originally set up by prom, and holds the interrupt
@@ -666,7 +913,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
666 default: 913 default:
667 return; 914 return;
668 } 915 }
669 writeq(force_int_val, &ce_mmr->ce_adm_force_int); 916 tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_force_int, force_int_val);
670} 917}
671 918
672/** 919/**
@@ -685,6 +932,7 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
685{ 932{
686 struct pcidev_info *pcidev_info; 933 struct pcidev_info *pcidev_info;
687 struct tioce_common *ce_common; 934 struct tioce_common *ce_common;
935 struct tioce_kernel *ce_kern;
688 struct tioce *ce_mmr; 936 struct tioce *ce_mmr;
689 int bit; 937 int bit;
690 u64 vector; 938 u64 vector;
@@ -695,14 +943,15 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
695 943
696 ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; 944 ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
697 ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; 945 ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
946 ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private;
698 947
699 bit = sn_irq_info->irq_int_bit; 948 bit = sn_irq_info->irq_int_bit;
700 949
701 __sn_setq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); 950 tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit));
702 vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT; 951 vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT;
703 vector |= sn_irq_info->irq_xtalkaddr; 952 vector |= sn_irq_info->irq_xtalkaddr;
704 writeq(vector, &ce_mmr->ce_adm_int_dest[bit]); 953 tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector);
705 __sn_clrq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); 954 tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit));
706 955
707 tioce_force_interrupt(sn_irq_info); 956 tioce_force_interrupt(sn_irq_info);
708} 957}
@@ -721,7 +970,11 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
721static void * 970static void *
722tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller) 971tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller)
723{ 972{
973 int my_nasid;
974 cnodeid_t my_cnode, mem_cnode;
724 struct tioce_common *tioce_common; 975 struct tioce_common *tioce_common;
976 struct tioce_kernel *tioce_kern;
977 struct tioce *tioce_mmr;
725 978
726 /* 979 /*
727 * Allocate kernel bus soft and copy from prom. 980 * Allocate kernel bus soft and copy from prom.
@@ -734,11 +987,23 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
734 memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common)); 987 memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common));
735 tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET; 988 tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET;
736 989
737 if (tioce_kern_init(tioce_common) == NULL) { 990 tioce_kern = tioce_kern_init(tioce_common);
991 if (tioce_kern == NULL) {
738 kfree(tioce_common); 992 kfree(tioce_common);
739 return NULL; 993 return NULL;
740 } 994 }
741 995
996 /*
997 * Clear out any transient errors before registering the error
998 * interrupt handler.
999 */
1000
1001 tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base;
1002 tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL);
1003 tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias,
1004 ~0ULL);
1005 tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL);
1006
742 if (request_irq(SGI_PCIASIC_ERROR, 1007 if (request_irq(SGI_PCIASIC_ERROR,
743 tioce_error_intr_handler, 1008 tioce_error_intr_handler,
744 SA_SHIRQ, "TIOCE error", (void *)tioce_common)) 1009 SA_SHIRQ, "TIOCE error", (void *)tioce_common))
@@ -750,6 +1015,21 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
750 tioce_common->ce_pcibus.bs_persist_segment, 1015 tioce_common->ce_pcibus.bs_persist_segment,
751 tioce_common->ce_pcibus.bs_persist_busnum); 1016 tioce_common->ce_pcibus.bs_persist_busnum);
752 1017
1018 /*
1019 * identify closest nasid for memory allocations
1020 */
1021
1022 my_nasid = NASID_GET(tioce_common->ce_pcibus.bs_base);
1023 my_cnode = nasid_to_cnodeid(my_nasid);
1024
1025 if (sn_hwperf_get_nearest_node(my_cnode, &mem_cnode, NULL) < 0) {
1026 printk(KERN_WARNING "tioce_bus_fixup: failed to find "
1027 "closest node with MEM to TIO node %d\n", my_cnode);
1028 mem_cnode = (cnodeid_t)-1; /* use any node */
1029 }
1030
1031 controller->node = mem_cnode;
1032
753 return tioce_common; 1033 return tioce_common;
754} 1034}
755 1035
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 6facf15b04f3..c9e7dad860b7 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -226,7 +226,7 @@ void free_initmem(void)
226 addr = (unsigned long)(&__init_begin); 226 addr = (unsigned long)(&__init_begin);
227 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 227 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
228 ClearPageReserved(virt_to_page(addr)); 228 ClearPageReserved(virt_to_page(addr));
229 set_page_count(virt_to_page(addr), 1); 229 init_page_count(virt_to_page(addr));
230 free_page(addr); 230 free_page(addr);
231 totalram_pages++; 231 totalram_pages++;
232 } 232 }
@@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
244 unsigned long p; 244 unsigned long p;
245 for (p = start; p < end; p += PAGE_SIZE) { 245 for (p = start; p < end; p += PAGE_SIZE) {
246 ClearPageReserved(virt_to_page(p)); 246 ClearPageReserved(virt_to_page(p));
247 set_page_count(virt_to_page(p), 1); 247 init_page_count(virt_to_page(p));
248 free_page(p); 248 free_page(p);
249 totalram_pages++; 249 totalram_pages++;
250 } 250 }
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index c45beb955943..a190e39c907a 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
137 int pages = 0; 137 int pages = 0;
138 for (; start < end; start += PAGE_SIZE) { 138 for (; start < end; start += PAGE_SIZE) {
139 ClearPageReserved(virt_to_page(start)); 139 ClearPageReserved(virt_to_page(start));
140 set_page_count(virt_to_page(start), 1); 140 init_page_count(virt_to_page(start));
141 free_page(start); 141 free_page(start);
142 totalram_pages++; 142 totalram_pages++;
143 pages++; 143 pages++;
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index 559942ce0e1e..d6d582a5abb0 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable)
54 54
55 /* unreserve the page so it's possible to free that page */ 55 /* unreserve the page so it's possible to free that page */
56 PD_PAGE(dp)->flags &= ~(1 << PG_reserved); 56 PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
57 set_page_count(PD_PAGE(dp), 1); 57 init_page_count(PD_PAGE(dp));
58 58
59 return; 59 return;
60} 60}
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index d855fec26317..afb57eeafdcb 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -276,7 +276,7 @@ void free_initmem(void)
276 addr = (unsigned long)&__init_begin; 276 addr = (unsigned long)&__init_begin;
277 for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { 277 for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) {
278 virt_to_page(addr)->flags &= ~(1 << PG_reserved); 278 virt_to_page(addr)->flags &= ~(1 << PG_reserved);
279 set_page_count(virt_to_page(addr), 1); 279 init_page_count(virt_to_page(addr));
280 free_page(addr); 280 free_page(addr);
281 totalram_pages++; 281 totalram_pages++;
282 } 282 }
diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c
index eddb8d3e130a..d844c755945a 100644
--- a/arch/m68knommu/kernel/m68k_ksyms.c
+++ b/arch/m68knommu/kernel/m68k_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__ioremap);
26EXPORT_SYMBOL(iounmap); 26EXPORT_SYMBOL(iounmap);
27EXPORT_SYMBOL(dump_fpu); 27EXPORT_SYMBOL(dump_fpu);
28EXPORT_SYMBOL(strnlen); 28EXPORT_SYMBOL(strnlen);
29EXPORT_SYMBOL(strpbrk);
29EXPORT_SYMBOL(strrchr); 30EXPORT_SYMBOL(strrchr);
30EXPORT_SYMBOL(strstr); 31EXPORT_SYMBOL(strstr);
31EXPORT_SYMBOL(strchr); 32EXPORT_SYMBOL(strchr);
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index 89f0b554ffb7..d79503fe6e42 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
195 int pages = 0; 195 int pages = 0;
196 for (; start < end; start += PAGE_SIZE) { 196 for (; start < end; start += PAGE_SIZE) {
197 ClearPageReserved(virt_to_page(start)); 197 ClearPageReserved(virt_to_page(start));
198 set_page_count(virt_to_page(start), 1); 198 init_page_count(virt_to_page(start));
199 free_page(start); 199 free_page(start);
200 totalram_pages++; 200 totalram_pages++;
201 pages++; 201 pages++;
@@ -218,7 +218,7 @@ free_initmem()
218 /* next to check that the page we free is not a partial page */ 218 /* next to check that the page we free is not a partial page */
219 for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { 219 for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
220 ClearPageReserved(virt_to_page(addr)); 220 ClearPageReserved(virt_to_page(addr));
221 set_page_count(virt_to_page(addr), 1); 221 init_page_count(virt_to_page(addr));
222 free_page(addr); 222 free_page(addr);
223 totalram_pages++; 223 totalram_pages++;
224 } 224 }
diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c
index 958d2eb78862..8a9ef58cc399 100644
--- a/arch/mips/arc/memory.c
+++ b/arch/mips/arc/memory.c
@@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void)
158 while (addr < boot_mem_map.map[i].addr 158 while (addr < boot_mem_map.map[i].addr
159 + boot_mem_map.map[i].size) { 159 + boot_mem_map.map[i].size) {
160 ClearPageReserved(virt_to_page(__va(addr))); 160 ClearPageReserved(virt_to_page(__va(addr)));
161 set_page_count(virt_to_page(__va(addr)), 1); 161 init_page_count(virt_to_page(__va(addr)));
162 free_page((unsigned long)__va(addr)); 162 free_page((unsigned long)__va(addr));
163 addr += PAGE_SIZE; 163 addr += PAGE_SIZE;
164 freed += PAGE_SIZE; 164 freed += PAGE_SIZE;
diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c
index 81cb5a76cfb7..1edaf3074ee9 100644
--- a/arch/mips/dec/prom/memory.c
+++ b/arch/mips/dec/prom/memory.c
@@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void)
118 addr = PAGE_SIZE; 118 addr = PAGE_SIZE;
119 while (addr < end) { 119 while (addr < end) {
120 ClearPageReserved(virt_to_page(__va(addr))); 120 ClearPageReserved(virt_to_page(__va(addr)));
121 set_page_count(virt_to_page(__va(addr)), 1); 121 init_page_count(virt_to_page(__va(addr)));
122 free_page((unsigned long)__va(addr)); 122 free_page((unsigned long)__va(addr));
123 addr += PAGE_SIZE; 123 addr += PAGE_SIZE;
124 } 124 }
diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c
index 2c8afd77a20b..ee5e70c95cf3 100644
--- a/arch/mips/mips-boards/generic/memory.c
+++ b/arch/mips/mips-boards/generic/memory.c
@@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void)
174 while (addr < boot_mem_map.map[i].addr 174 while (addr < boot_mem_map.map[i].addr
175 + boot_mem_map.map[i].size) { 175 + boot_mem_map.map[i].size) {
176 ClearPageReserved(virt_to_page(__va(addr))); 176 ClearPageReserved(virt_to_page(__va(addr)));
177 set_page_count(virt_to_page(__va(addr)), 1); 177 init_page_count(virt_to_page(__va(addr)));
178 free_page((unsigned long)__va(addr)); 178 free_page((unsigned long)__va(addr));
179 addr += PAGE_SIZE; 179 addr += PAGE_SIZE;
180 freed += PAGE_SIZE; 180 freed += PAGE_SIZE;
diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c
index 0dbd7435bb2a..1ec4e75656bd 100644
--- a/arch/mips/mips-boards/sim/sim_mem.c
+++ b/arch/mips/mips-boards/sim/sim_mem.c
@@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void)
117 while (addr < boot_mem_map.map[i].addr 117 while (addr < boot_mem_map.map[i].addr
118 + boot_mem_map.map[i].size) { 118 + boot_mem_map.map[i].size) {
119 ClearPageReserved(virt_to_page(__va(addr))); 119 ClearPageReserved(virt_to_page(__va(addr)));
120 set_page_count(virt_to_page(__va(addr)), 1); 120 init_page_count(virt_to_page(__va(addr)));
121 free_page((unsigned long)__va(addr)); 121 free_page((unsigned long)__va(addr));
122 addr += PAGE_SIZE; 122 addr += PAGE_SIZE;
123 freed += PAGE_SIZE; 123 freed += PAGE_SIZE;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 0ff9a348b843..52f7d59fe612 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -54,7 +54,8 @@ unsigned long empty_zero_page, zero_page_mask;
54 */ 54 */
55unsigned long setup_zero_pages(void) 55unsigned long setup_zero_pages(void)
56{ 56{
57 unsigned long order, size; 57 unsigned int order;
58 unsigned long size;
58 struct page *page; 59 struct page *page;
59 60
60 if (cpu_has_vce) 61 if (cpu_has_vce)
@@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void)
67 panic("Oh boy, that early out of memory?"); 68 panic("Oh boy, that early out of memory?");
68 69
69 page = virt_to_page(empty_zero_page); 70 page = virt_to_page(empty_zero_page);
71 split_page(page, order);
70 while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) { 72 while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) {
71 SetPageReserved(page); 73 SetPageReserved(page);
72 set_page_count(page, 1);
73 page++; 74 page++;
74 } 75 }
75 76
@@ -244,7 +245,7 @@ void __init mem_init(void)
244#ifdef CONFIG_LIMITED_DMA 245#ifdef CONFIG_LIMITED_DMA
245 set_page_address(page, lowmem_page_address(page)); 246 set_page_address(page, lowmem_page_address(page));
246#endif 247#endif
247 set_page_count(page, 1); 248 init_page_count(page);
248 __free_page(page); 249 __free_page(page);
249 totalhigh_pages++; 250 totalhigh_pages++;
250 } 251 }
@@ -291,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
291 292
292 for (; start < end; start += PAGE_SIZE) { 293 for (; start < end; start += PAGE_SIZE) {
293 ClearPageReserved(virt_to_page(start)); 294 ClearPageReserved(virt_to_page(start));
294 set_page_count(virt_to_page(start), 1); 295 init_page_count(virt_to_page(start));
295 free_page(start); 296 free_page(start);
296 totalram_pages++; 297 totalram_pages++;
297 } 298 }
@@ -314,7 +315,7 @@ void free_initmem(void)
314 page = addr; 315 page = addr;
315#endif 316#endif
316 ClearPageReserved(virt_to_page(page)); 317 ClearPageReserved(virt_to_page(page));
317 set_page_count(virt_to_page(page), 1); 318 init_page_count(virt_to_page(page));
318 free_page(page); 319 free_page(page);
319 totalram_pages++; 320 totalram_pages++;
320 freed += PAGE_SIZE; 321 freed += PAGE_SIZE;
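The mips hunks above (and the ppc/sh DMA hunks further down) replace hand-rolled set_page_count() loops with split_page(), which splits one higher-order allocation into independently refcounted order-0 pages that can then be reserved or freed one by one. A minimal hedged sketch of the idiom:

	/* allocate 1 << order pages, keep the first, release the rest individually */
	struct page *page = alloc_pages(GFP_KERNEL, order);

	if (page) {
		struct page *p, *end = page + (1 << order);

		split_page(page, order);	/* each page gets its own refcount */
		for (p = page + 1; p < end; p++)
			__free_page(p);
	}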
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index ed93a9792959..e0d095daa5ed 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -559,7 +559,7 @@ void __init mem_init(void)
559 /* if (!page_is_ram(pgnr)) continue; */ 559 /* if (!page_is_ram(pgnr)) continue; */
560 /* commented out until page_is_ram works */ 560 /* commented out until page_is_ram works */
561 ClearPageReserved(p); 561 ClearPageReserved(p);
562 set_page_count(p, 1); 562 init_page_count(p);
563 __free_page(p); 563 __free_page(p);
564 totalram_pages++; 564 totalram_pages++;
565 } 565 }
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 7847ca13d6c2..852eda3953dc 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -398,7 +398,7 @@ void free_initmem(void)
398 addr = (unsigned long)(&__init_begin); 398 addr = (unsigned long)(&__init_begin);
399 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 399 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
400 ClearPageReserved(virt_to_page(addr)); 400 ClearPageReserved(virt_to_page(addr));
401 set_page_count(virt_to_page(addr), 1); 401 init_page_count(virt_to_page(addr));
402 free_page(addr); 402 free_page(addr);
403 num_physpages++; 403 num_physpages++;
404 totalram_pages++; 404 totalram_pages++;
@@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
1018 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 1018 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
1019 for (; start < end; start += PAGE_SIZE) { 1019 for (; start < end; start += PAGE_SIZE) {
1020 ClearPageReserved(virt_to_page(start)); 1020 ClearPageReserved(virt_to_page(start));
1021 set_page_count(virt_to_page(start), 1); 1021 init_page_count(virt_to_page(start));
1022 free_page(start); 1022 free_page(start);
1023 num_physpages++; 1023 num_physpages++;
1024 totalram_pages++; 1024 totalram_pages++;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index b51bb28c054b..7370f9f33e29 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
133 return __pte(old); 133 return __pte(old);
134} 134}
135 135
136/*
137 * This function checks for proper alignment of input addr and len parameters.
138 */
139int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
140{
141 if (len & ~HPAGE_MASK)
142 return -EINVAL;
143 if (addr & ~HPAGE_MASK)
144 return -EINVAL;
145 if (! (within_hugepage_low_range(addr, len)
146 || within_hugepage_high_range(addr, len)) )
147 return -EINVAL;
148 return 0;
149}
150
151struct slb_flush_info { 136struct slb_flush_info {
152 struct mm_struct *mm; 137 struct mm_struct *mm;
153 u16 newareas; 138 u16 newareas;
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 7d0d75c11848..b57fb3a2b7bb 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
216 216
217 while (start < end) { 217 while (start < end) {
218 ClearPageReserved(virt_to_page(start)); 218 ClearPageReserved(virt_to_page(start));
219 set_page_count(virt_to_page(start), 1); 219 init_page_count(virt_to_page(start));
220 free_page(start); 220 free_page(start);
221 cnt++; 221 cnt++;
222 start += PAGE_SIZE; 222 start += PAGE_SIZE;
@@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
248 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 248 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
249 for (; start < end; start += PAGE_SIZE) { 249 for (; start < end; start += PAGE_SIZE) {
250 ClearPageReserved(virt_to_page(start)); 250 ClearPageReserved(virt_to_page(start));
251 set_page_count(virt_to_page(start), 1); 251 init_page_count(virt_to_page(start));
252 free_page(start); 252 free_page(start);
253 totalram_pages++; 253 totalram_pages++;
254 } 254 }
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 81cfb0c2ec58..bacb71c89811 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -140,7 +140,7 @@ void free_initmem(void)
140 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { 140 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
141 memset((void *)addr, 0xcc, PAGE_SIZE); 141 memset((void *)addr, 0xcc, PAGE_SIZE);
142 ClearPageReserved(virt_to_page(addr)); 142 ClearPageReserved(virt_to_page(addr));
143 set_page_count(virt_to_page(addr), 1); 143 init_page_count(virt_to_page(addr));
144 free_page(addr); 144 free_page(addr);
145 totalram_pages++; 145 totalram_pages++;
146 } 146 }
@@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
155 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 155 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
156 for (; start < end; start += PAGE_SIZE) { 156 for (; start < end; start += PAGE_SIZE) {
157 ClearPageReserved(virt_to_page(start)); 157 ClearPageReserved(virt_to_page(start));
158 set_page_count(virt_to_page(start), 1); 158 init_page_count(virt_to_page(start));
159 free_page(start); 159 free_page(start);
160 totalram_pages++; 160 totalram_pages++;
161 } 161 }
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 550517c2dd42..454cac01d8cc 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -108,8 +108,8 @@ EXPORT_SYMBOL(phys_mem_access_prot);
108void online_page(struct page *page) 108void online_page(struct page *page)
109{ 109{
110 ClearPageReserved(page); 110 ClearPageReserved(page);
111 set_page_count(page, 0); 111 init_page_count(page);
112 free_cold_page(page); 112 __free_page(page);
113 totalram_pages++; 113 totalram_pages++;
114 num_physpages++; 114 num_physpages++;
115} 115}
@@ -376,7 +376,7 @@ void __init mem_init(void)
376 struct page *page = pfn_to_page(pfn); 376 struct page *page = pfn_to_page(pfn);
377 377
378 ClearPageReserved(page); 378 ClearPageReserved(page);
379 set_page_count(page, 1); 379 init_page_count(page);
380 __free_page(page); 380 __free_page(page);
381 totalhigh_pages++; 381 totalhigh_pages++;
382 } 382 }
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index b33a4443f5a9..fec8e65b36ea 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe,
115 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 115 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
116 struct page *page = pfn_to_page(pfn); 116 struct page *page = pfn_to_page(pfn);
117 set_page_links(page, ZONE_DMA, node_id, pfn); 117 set_page_links(page, ZONE_DMA, node_id, pfn);
118 set_page_count(page, 1); 118 init_page_count(page);
119 reset_page_mapcount(page); 119 reset_page_mapcount(page);
120 SetPageReserved(page); 120 SetPageReserved(page);
121 INIT_LIST_HEAD(&page->lru); 121 INIT_LIST_HEAD(&page->lru);
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
index 685fd0defe23..61465ec88bc7 100644
--- a/arch/ppc/kernel/dma-mapping.c
+++ b/arch/ppc/kernel/dma-mapping.c
@@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
223 pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); 223 pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
224 struct page *end = page + (1 << order); 224 struct page *end = page + (1 << order);
225 225
226 split_page(page, order);
227
226 /* 228 /*
227 * Set the "dma handle" 229 * Set the "dma handle"
228 */ 230 */
@@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
231 do { 233 do {
232 BUG_ON(!pte_none(*pte)); 234 BUG_ON(!pte_none(*pte));
233 235
234 set_page_count(page, 1);
235 SetPageReserved(page); 236 SetPageReserved(page);
236 set_pte_at(&init_mm, vaddr, 237 set_pte_at(&init_mm, vaddr,
237 pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); 238 pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
@@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
244 * Free the otherwise unused pages. 245 * Free the otherwise unused pages.
245 */ 246 */
246 while (page < end) { 247 while (page < end) {
247 set_page_count(page, 1);
248 __free_page(page); 248 __free_page(page);
249 page++; 249 page++;
250 } 250 }
diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c
index 134db5c04203..cb1c294fb932 100644
--- a/arch/ppc/mm/init.c
+++ b/arch/ppc/mm/init.c
@@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
140 140
141 while (start < end) { 141 while (start < end) {
142 ClearPageReserved(virt_to_page(start)); 142 ClearPageReserved(virt_to_page(start));
143 set_page_count(virt_to_page(start), 1); 143 init_page_count(virt_to_page(start));
144 free_page(start); 144 free_page(start);
145 cnt++; 145 cnt++;
146 start += PAGE_SIZE; 146 start += PAGE_SIZE;
@@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
172 172
173 for (; start < end; start += PAGE_SIZE) { 173 for (; start < end; start += PAGE_SIZE) {
174 ClearPageReserved(virt_to_page(start)); 174 ClearPageReserved(virt_to_page(start));
175 set_page_count(virt_to_page(start), 1); 175 init_page_count(virt_to_page(start));
176 free_page(start); 176 free_page(start);
177 totalram_pages++; 177 totalram_pages++;
178 } 178 }
@@ -441,7 +441,7 @@ void __init mem_init(void)
441 struct page *page = mem_map + pfn; 441 struct page *page = mem_map + pfn;
442 442
443 ClearPageReserved(page); 443 ClearPageReserved(page);
444 set_page_count(page, 1); 444 init_page_count(page);
445 __free_page(page); 445 __free_page(page);
446 totalhigh_pages++; 446 totalhigh_pages++;
447 } 447 }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index df953383724d..a055894f3bd8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -292,7 +292,7 @@ void free_initmem(void)
292 addr = (unsigned long)(&__init_begin); 292 addr = (unsigned long)(&__init_begin);
293 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 293 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
294 ClearPageReserved(virt_to_page(addr)); 294 ClearPageReserved(virt_to_page(addr));
295 set_page_count(virt_to_page(addr), 1); 295 init_page_count(virt_to_page(addr));
296 free_page(addr); 296 free_page(addr);
297 totalram_pages++; 297 totalram_pages++;
298 } 298 }
@@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
307 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 307 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
308 for (; start < end; start += PAGE_SIZE) { 308 for (; start < end; start += PAGE_SIZE) {
309 ClearPageReserved(virt_to_page(start)); 309 ClearPageReserved(virt_to_page(start));
310 set_page_count(virt_to_page(start), 1); 310 init_page_count(virt_to_page(start));
311 free_page(start); 311 free_page(start);
312 totalram_pages++; 312 totalram_pages++;
313 } 313 }
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index df3a9e452cc5..ee73e30263af 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
23 page = alloc_pages(gfp, order); 23 page = alloc_pages(gfp, order);
24 if (!page) 24 if (!page)
25 return NULL; 25 return NULL;
26 split_page(page, order);
26 27
27 ret = page_address(page); 28 ret = page_address(page);
28 *handle = virt_to_phys(ret); 29 *handle = virt_to_phys(ret);
@@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
37 end = page + (1 << order); 38 end = page + (1 << order);
38 39
39 while (++page < end) { 40 while (++page < end) {
40 set_page_count(page, 1);
41
42 /* Free any unused pages */ 41 /* Free any unused pages */
43 if (page >= free) { 42 if (page >= free) {
44 __free_page(page); 43 __free_page(page);
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6b7a7688c98e..a3568fd51508 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
84 return entry; 84 return entry;
85} 85}
86 86
87/*
88 * This function checks for proper alignment of input addr and len parameters.
89 */
90int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
91{
92 if (len & ~HPAGE_MASK)
93 return -EINVAL;
94 if (addr & ~HPAGE_MASK)
95 return -EINVAL;
96 return 0;
97}
98
99struct page *follow_huge_addr(struct mm_struct *mm, 87struct page *follow_huge_addr(struct mm_struct *mm,
100 unsigned long address, int write) 88 unsigned long address, int write)
101{ 89{
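is_aligned_hugepage_range() is deleted here and, identically, from sh64 and sparc64 below, presumably because a common definition now covers these architectures elsewhere in this series. The check itself is plain mask arithmetic against the huge-page size; a worked example (HPAGE_SHIFT = 22, i.e. 4 MB huge pages, is assumed only for illustration):

#define EX_HPAGE_SIZE	(1UL << 22)
#define EX_HPAGE_MASK	(~(EX_HPAGE_SIZE - 1))

/* 0x00c00000 & ~EX_HPAGE_MASK == 0      -> aligned, the check passes   */
/* 0x00c01000 & ~EX_HPAGE_MASK == 0x1000 -> misaligned, returns -EINVAL */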
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index e342565f75fb..77b4a838fe10 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -273,7 +273,7 @@ void free_initmem(void)
273 addr = (unsigned long)(&__init_begin); 273 addr = (unsigned long)(&__init_begin);
274 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 274 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
275 ClearPageReserved(virt_to_page(addr)); 275 ClearPageReserved(virt_to_page(addr));
276 set_page_count(virt_to_page(addr), 1); 276 init_page_count(virt_to_page(addr));
277 free_page(addr); 277 free_page(addr);
278 totalram_pages++; 278 totalram_pages++;
279 } 279 }
@@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
286 unsigned long p; 286 unsigned long p;
287 for (p = start; p < end; p += PAGE_SIZE) { 287 for (p = start; p < end; p += PAGE_SIZE) {
288 ClearPageReserved(virt_to_page(p)); 288 ClearPageReserved(virt_to_page(p));
289 set_page_count(virt_to_page(p), 1); 289 init_page_count(virt_to_page(p));
290 free_page(p); 290 free_page(p);
291 totalram_pages++; 291 totalram_pages++;
292 } 292 }
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index ed6a505b3ee2..3d89f2a6c785 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
84 return entry; 84 return entry;
85} 85}
86 86
87/*
88 * This function checks for proper alignment of input addr and len parameters.
89 */
90int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
91{
92 if (len & ~HPAGE_MASK)
93 return -EINVAL;
94 if (addr & ~HPAGE_MASK)
95 return -EINVAL;
96 return 0;
97}
98
99struct page *follow_huge_addr(struct mm_struct *mm, 87struct page *follow_huge_addr(struct mm_struct *mm,
100 unsigned long address, int write) 88 unsigned long address, int write)
101{ 89{
diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c
index a65e8bb2c3cc..1169757fb38b 100644
--- a/arch/sh64/mm/init.c
+++ b/arch/sh64/mm/init.c
@@ -173,7 +173,7 @@ void free_initmem(void)
173 addr = (unsigned long)(&__init_begin); 173 addr = (unsigned long)(&__init_begin);
174 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 174 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
175 ClearPageReserved(virt_to_page(addr)); 175 ClearPageReserved(virt_to_page(addr));
176 set_page_count(virt_to_page(addr), 1); 176 init_page_count(virt_to_page(addr));
177 free_page(addr); 177 free_page(addr);
178 totalram_pages++; 178 totalram_pages++;
179 } 179 }
@@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
186 unsigned long p; 186 unsigned long p;
187 for (p = start; p < end; p += PAGE_SIZE) { 187 for (p = start; p < end; p += PAGE_SIZE) {
188 ClearPageReserved(virt_to_page(p)); 188 ClearPageReserved(virt_to_page(p));
189 set_page_count(virt_to_page(p), 1); 189 init_page_count(virt_to_page(p));
190 free_page(p); 190 free_page(p);
191 totalram_pages++; 191 totalram_pages++;
192 } 192 }
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 40d426cce824..4219dd2ce3a2 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void)
266 266
267 /* Free unneeded trap tables */ 267 /* Free unneeded trap tables */
268 ClearPageReserved(virt_to_page(trapbase_cpu1)); 268 ClearPageReserved(virt_to_page(trapbase_cpu1));
269 set_page_count(virt_to_page(trapbase_cpu1), 1); 269 init_page_count(virt_to_page(trapbase_cpu1));
270 free_page((unsigned long)trapbase_cpu1); 270 free_page((unsigned long)trapbase_cpu1);
271 totalram_pages++; 271 totalram_pages++;
272 num_physpages++; 272 num_physpages++;
273 273
274 ClearPageReserved(virt_to_page(trapbase_cpu2)); 274 ClearPageReserved(virt_to_page(trapbase_cpu2));
275 set_page_count(virt_to_page(trapbase_cpu2), 1); 275 init_page_count(virt_to_page(trapbase_cpu2));
276 free_page((unsigned long)trapbase_cpu2); 276 free_page((unsigned long)trapbase_cpu2);
277 totalram_pages++; 277 totalram_pages++;
278 num_physpages++; 278 num_physpages++;
279 279
280 ClearPageReserved(virt_to_page(trapbase_cpu3)); 280 ClearPageReserved(virt_to_page(trapbase_cpu3));
281 set_page_count(virt_to_page(trapbase_cpu3), 1); 281 init_page_count(virt_to_page(trapbase_cpu3));
282 free_page((unsigned long)trapbase_cpu3); 282 free_page((unsigned long)trapbase_cpu3);
283 totalram_pages++; 283 totalram_pages++;
284 num_physpages++; 284 num_physpages++;
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index a21f27d10e55..fbbd8a474c4c 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void)
233 /* Free unneeded trap tables */ 233 /* Free unneeded trap tables */
234 if (!cpu_isset(i, cpu_present_map)) { 234 if (!cpu_isset(i, cpu_present_map)) {
235 ClearPageReserved(virt_to_page(trapbase_cpu1)); 235 ClearPageReserved(virt_to_page(trapbase_cpu1));
236 set_page_count(virt_to_page(trapbase_cpu1), 1); 236 init_page_count(virt_to_page(trapbase_cpu1));
237 free_page((unsigned long)trapbase_cpu1); 237 free_page((unsigned long)trapbase_cpu1);
238 totalram_pages++; 238 totalram_pages++;
239 num_physpages++; 239 num_physpages++;
240 } 240 }
241 if (!cpu_isset(2, cpu_present_map)) { 241 if (!cpu_isset(2, cpu_present_map)) {
242 ClearPageReserved(virt_to_page(trapbase_cpu2)); 242 ClearPageReserved(virt_to_page(trapbase_cpu2));
243 set_page_count(virt_to_page(trapbase_cpu2), 1); 243 init_page_count(virt_to_page(trapbase_cpu2));
244 free_page((unsigned long)trapbase_cpu2); 244 free_page((unsigned long)trapbase_cpu2);
245 totalram_pages++; 245 totalram_pages++;
246 num_physpages++; 246 num_physpages++;
247 } 247 }
248 if (!cpu_isset(3, cpu_present_map)) { 248 if (!cpu_isset(3, cpu_present_map)) {
249 ClearPageReserved(virt_to_page(trapbase_cpu3)); 249 ClearPageReserved(virt_to_page(trapbase_cpu3));
250 set_page_count(virt_to_page(trapbase_cpu3), 1); 250 init_page_count(virt_to_page(trapbase_cpu3));
251 free_page((unsigned long)trapbase_cpu3); 251 free_page((unsigned long)trapbase_cpu3);
252 totalram_pages++; 252 totalram_pages++;
253 num_physpages++; 253 num_physpages++;
diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c
index c03babaa0498..898669732466 100644
--- a/arch/sparc/mm/init.c
+++ b/arch/sparc/mm/init.c
@@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
383 struct page *page = pfn_to_page(tmp); 383 struct page *page = pfn_to_page(tmp);
384 384
385 ClearPageReserved(page); 385 ClearPageReserved(page);
386 set_page_count(page, 1); 386 init_page_count(page);
387 __free_page(page); 387 __free_page(page);
388 totalhigh_pages++; 388 totalhigh_pages++;
389 } 389 }
@@ -480,7 +480,7 @@ void free_initmem (void)
480 p = virt_to_page(addr); 480 p = virt_to_page(addr);
481 481
482 ClearPageReserved(p); 482 ClearPageReserved(p);
483 set_page_count(p, 1); 483 init_page_count(p);
484 __free_page(p); 484 __free_page(p);
485 totalram_pages++; 485 totalram_pages++;
486 num_physpages++; 486 num_physpages++;
@@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
497 struct page *p = virt_to_page(start); 497 struct page *p = virt_to_page(start);
498 498
499 ClearPageReserved(p); 499 ClearPageReserved(p);
500 set_page_count(p, 1); 500 init_page_count(p);
501 __free_page(p); 501 __free_page(p);
502 num_physpages++; 502 num_physpages++;
503 } 503 }
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index a7a24869d045..280dc7958a13 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
263 return entry; 263 return entry;
264} 264}
265 265
266/*
267 * This function checks for proper alignment of input addr and len parameters.
268 */
269int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
270{
271 if (len & ~HPAGE_MASK)
272 return -EINVAL;
273 if (addr & ~HPAGE_MASK)
274 return -EINVAL;
275 return 0;
276}
277
278struct page *follow_huge_addr(struct mm_struct *mm, 266struct page *follow_huge_addr(struct mm_struct *mm,
279 unsigned long address, int write) 267 unsigned long address, int write)
280{ 268{
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index c2b556106fc1..2ae143ba50d8 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1461,7 +1461,7 @@ void free_initmem(void)
1461 p = virt_to_page(page); 1461 p = virt_to_page(page);
1462 1462
1463 ClearPageReserved(p); 1463 ClearPageReserved(p);
1464 set_page_count(p, 1); 1464 init_page_count(p);
1465 __free_page(p); 1465 __free_page(p);
1466 num_physpages++; 1466 num_physpages++;
1467 totalram_pages++; 1467 totalram_pages++;
@@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
1477 struct page *p = virt_to_page(start); 1477 struct page *p = virt_to_page(start);
1478 1478
1479 ClearPageReserved(p); 1479 ClearPageReserved(p);
1480 set_page_count(p, 1); 1480 init_page_count(p);
1481 __free_page(p); 1481 __free_page(p);
1482 num_physpages++; 1482 num_physpages++;
1483 totalram_pages++; 1483 totalram_pages++;
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index fa4f915be5c5..92cce96b5e24 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start,
57 for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ 57 for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){
58 page = &mem_map[highmem_pfn + i]; 58 page = &mem_map[highmem_pfn + i];
59 ClearPageReserved(page); 59 ClearPageReserved(page);
60 set_page_count(page, 1); 60 init_page_count(page);
61 __free_page(page); 61 __free_page(page);
62 } 62 }
63} 63}
@@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
296 (end - start) >> 10); 296 (end - start) >> 10);
297 for (; start < end; start += PAGE_SIZE) { 297 for (; start < end; start += PAGE_SIZE) {
298 ClearPageReserved(virt_to_page(start)); 298 ClearPageReserved(virt_to_page(start));
299 set_page_count(virt_to_page(start), 1); 299 init_page_count(virt_to_page(start));
300 free_page(start); 300 free_page(start);
301 totalram_pages++; 301 totalram_pages++;
302 } 302 }
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index 544665e04513..0e65340eee33 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -279,7 +279,7 @@ int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem)
279 279
280 for(i = 0; i < total_pages; i++){ 280 for(i = 0; i < total_pages; i++){
281 p = &map[i]; 281 p = &map[i];
282 set_page_count(p, 0); 282 memset(p, 0, sizeof(struct page));
283 SetPageReserved(p); 283 SetPageReserved(p);
284 INIT_LIST_HEAD(&p->lru); 284 INIT_LIST_HEAD(&p->lru);
285 } 285 }
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 3080f84bf7b7..ee5ce3d3cbc3 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -477,7 +477,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
477 return IRQ_HANDLED; 477 return IRQ_HANDLED;
478} 478}
479 479
480static unsigned int cyc2ns_scale; 480static unsigned int cyc2ns_scale __read_mostly;
481#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 481#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
482 482
483static inline void set_cyc2ns_scale(unsigned long cpu_khz) 483static inline void set_cyc2ns_scale(unsigned long cpu_khz)
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 3496abc8d372..c9dc7e46731e 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -124,6 +124,7 @@ extern void * __memcpy(void *,const void *,__kernel_size_t);
124 124
125EXPORT_SYMBOL(memset); 125EXPORT_SYMBOL(memset);
126EXPORT_SYMBOL(strlen); 126EXPORT_SYMBOL(strlen);
127EXPORT_SYMBOL(strpbrk);
127EXPORT_SYMBOL(memmove); 128EXPORT_SYMBOL(memmove);
128EXPORT_SYMBOL(memcpy); 129EXPORT_SYMBOL(memcpy);
129EXPORT_SYMBOL(__memcpy); 130EXPORT_SYMBOL(__memcpy);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 7af1742aa958..40ed13d263cd 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
486void online_page(struct page *page) 486void online_page(struct page *page)
487{ 487{
488 ClearPageReserved(page); 488 ClearPageReserved(page);
489 set_page_count(page, 1); 489 init_page_count(page);
490 __free_page(page); 490 __free_page(page);
491 totalram_pages++; 491 totalram_pages++;
492 num_physpages++; 492 num_physpages++;
@@ -592,7 +592,7 @@ void free_initmem(void)
592 addr = (unsigned long)(&__init_begin); 592 addr = (unsigned long)(&__init_begin);
593 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 593 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
594 ClearPageReserved(virt_to_page(addr)); 594 ClearPageReserved(virt_to_page(addr));
595 set_page_count(virt_to_page(addr), 1); 595 init_page_count(virt_to_page(addr));
596 memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 596 memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
597 free_page(addr); 597 free_page(addr);
598 totalram_pages++; 598 totalram_pages++;
@@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
632 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 632 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
633 for (; start < end; start += PAGE_SIZE) { 633 for (; start < end; start += PAGE_SIZE) {
634 ClearPageReserved(virt_to_page(start)); 634 ClearPageReserved(virt_to_page(start));
635 set_page_count(virt_to_page(start), 1); 635 init_page_count(virt_to_page(start));
636 free_page(start); 636 free_page(start);
637 totalram_pages++; 637 totalram_pages++;
638 } 638 }
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 35f1f1aab063..531ad21447b1 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -45,6 +45,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
45 pte_t *pbase; 45 pte_t *pbase;
46 if (!base) 46 if (!base)
47 return NULL; 47 return NULL;
48 /*
49 * page_private is used to track the number of entries in
 50 * the page table page that have non-standard attributes.
51 */
52 SetPagePrivate(base);
53 page_private(base) = 0;
54
48 address = __pa(address); 55 address = __pa(address);
49 addr = address & LARGE_PAGE_MASK; 56 addr = address & LARGE_PAGE_MASK;
50 pbase = (pte_t *)page_address(base); 57 pbase = (pte_t *)page_address(base);
@@ -77,26 +84,12 @@ static inline void flush_map(unsigned long address)
77 on_each_cpu(flush_kernel_map, (void *)address, 1, 1); 84 on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
78} 85}
79 86
80struct deferred_page { 87static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
81 struct deferred_page *next;
82 struct page *fpage;
83 unsigned long address;
84};
85static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
86 88
87static inline void save_page(unsigned long address, struct page *fpage) 89static inline void save_page(struct page *fpage)
88{ 90{
89 struct deferred_page *df; 91 fpage->lru.next = (struct list_head *)deferred_pages;
90 df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 92 deferred_pages = fpage;
91 if (!df) {
92 flush_map(address);
93 __free_page(fpage);
94 } else {
95 df->next = df_list;
96 df->fpage = fpage;
97 df->address = address;
98 df_list = df;
99 }
100} 93}
101 94
102/* 95/*
@@ -138,8 +131,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
138 set_pte(kpte, pfn_pte(pfn, prot)); 131 set_pte(kpte, pfn_pte(pfn, prot));
139 } else { 132 } else {
140 /* 133 /*
141 * split_large_page will take the reference for this change_page_attr 134 * split_large_page will take the reference for this
142 * on the split page. 135 * change_page_attr on the split page.
143 */ 136 */
144 137
145 struct page *split; 138 struct page *split;
@@ -151,23 +144,20 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
151 set_pte(kpte,mk_pte(split, ref_prot2)); 144 set_pte(kpte,mk_pte(split, ref_prot2));
152 kpte_page = split; 145 kpte_page = split;
153 } 146 }
154 get_page(kpte_page); 147 page_private(kpte_page)++;
155 } else if ((kpte_flags & _PAGE_PSE) == 0) { 148 } else if ((kpte_flags & _PAGE_PSE) == 0) {
156 set_pte(kpte, pfn_pte(pfn, ref_prot)); 149 set_pte(kpte, pfn_pte(pfn, ref_prot));
157 __put_page(kpte_page); 150 BUG_ON(page_private(kpte_page) == 0);
151 page_private(kpte_page)--;
158 } else 152 } else
159 BUG(); 153 BUG();
160 154
161 /* on x86-64 the direct mapping set at boot is not using 4k pages */ 155 /* on x86-64 the direct mapping set at boot is not using 4k pages */
162 BUG_ON(PageReserved(kpte_page)); 156 BUG_ON(PageReserved(kpte_page));
163 157
164 switch (page_count(kpte_page)) { 158 if (page_private(kpte_page) == 0) {
165 case 1: 159 save_page(kpte_page);
166 save_page(address, kpte_page);
167 revert_page(address, ref_prot); 160 revert_page(address, ref_prot);
168 break;
169 case 0:
170 BUG(); /* memleak and failed 2M page regeneration */
171 } 161 }
172 return 0; 162 return 0;
173} 163}
@@ -220,17 +210,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot)
220 210
221void global_flush_tlb(void) 211void global_flush_tlb(void)
222{ 212{
223 struct deferred_page *df, *next_df; 213 struct page *dpage;
224 214
225 down_read(&init_mm.mmap_sem); 215 down_read(&init_mm.mmap_sem);
226 df = xchg(&df_list, NULL); 216 dpage = xchg(&deferred_pages, NULL);
227 up_read(&init_mm.mmap_sem); 217 up_read(&init_mm.mmap_sem);
228 flush_map((df && !df->next) ? df->address : 0); 218
229 for (; df; df = next_df) { 219 flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
230 next_df = df->next; 220 while (dpage) {
231 if (df->fpage) 221 struct page *tmp = dpage;
232 __free_page(df->fpage); 222 dpage = (struct page *)dpage->lru.next;
233 kfree(df); 223 ClearPagePrivate(tmp);
224 __free_page(tmp);
234 } 225 }
235} 226}
236 227
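Two related bookkeeping changes above: split_large_page() now records, in page_private(), how many PTEs of the split page-table page carry non-default protections, replacing the old trick of abusing the page refcount with get_page()/__put_page(); and the kmalloc'd deferred_page list becomes a chain threaded through the pages' own lru.next pointers, so queueing a page for later freeing can no longer fail. A stripped-down sketch of the new list handling (the real code swaps the list head under init_mm.mmap_sem and flushes the TLB before freeing):

#include <linux/mm.h>

static struct page *deferred;	/* singly linked through page->lru.next */

static void defer_free(struct page *page)
{
	page->lru.next = (struct list_head *)deferred;
	deferred = page;
}

static void flush_and_free_deferred(void)
{
	struct page *p = deferred;

	deferred = NULL;
	/* flush_map()/TLB flush happens here in the real code */
	while (p) {
		struct page *next = (struct page *)p->lru.next;

		ClearPagePrivate(p);
		__free_page(p);
		p = next;
	}
}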
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 5a91d6c9e66d..e1be4235f367 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end)
272{ 272{
273 for (; start < end; start += PAGE_SIZE) { 273 for (; start < end; start += PAGE_SIZE) {
274 ClearPageReserved(virt_to_page(start)); 274 ClearPageReserved(virt_to_page(start));
275 set_page_count(virt_to_page(start), 1); 275 init_page_count(virt_to_page(start));
276 free_page((unsigned long)start); 276 free_page((unsigned long)start);
277 totalram_pages++; 277 totalram_pages++;
278 } 278 }
diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c
index e5e119c820e4..7d28914d11cb 100644
--- a/arch/xtensa/mm/pgtable.c
+++ b/arch/xtensa/mm/pgtable.c
@@ -14,25 +14,21 @@
14 14
15pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 15pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
16{ 16{
17 pte_t *pte, p; 17 pte_t *pte = NULL, *p;
18 int color = ADDR_COLOR(address); 18 int color = ADDR_COLOR(address);
19 int i; 19 int i;
20 20
21 p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER); 21 p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER);
22 22
23 if (likely(p)) { 23 if (likely(p)) {
24 struct page *page; 24 split_page(virt_to_page(p), COLOR_ORDER);
25
26 for (i = 0; i < COLOR_SIZE; i++, p++) {
27 page = virt_to_page(pte);
28
29 set_page_count(page, 1);
30 ClearPageCompound(page);
31 25
26 for (i = 0; i < COLOR_SIZE; i++) {
32 if (ADDR_COLOR(p) == color) 27 if (ADDR_COLOR(p) == color)
33 pte = p; 28 pte = p;
34 else 29 else
35 free_page(p); 30 free_page(p);
31 p += PTRS_PER_PTE;
36 } 32 }
37 clear_page(pte); 33 clear_page(pte);
38 } 34 }
@@ -49,20 +45,20 @@ int flush;
49 45
50struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address) 46struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address)
51{ 47{
52 struct page *page, p; 48 struct page *page = NULL, *p;
53 int color = ADDR_COLOR(address); 49 int color = ADDR_COLOR(address);
54 50
55 p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); 51 p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER);
56 52
57 if (likely(p)) { 53 if (likely(p)) {
58 for (i = 0; i < PAGE_ORDER; i++) { 54 split_page(p, COLOR_ORDER);
59 set_page_count(p, 1);
60 ClearPageCompound(p);
61 55
62 if (PADDR_COLOR(page_address(pg)) == color) 56 for (i = 0; i < PAGE_ORDER; i++) {
57 if (PADDR_COLOR(page_address(p)) == color)
63 page = p; 58 page = p;
64 else 59 else
65 free_page(p); 60 __free_page(p);
61 p++;
66 } 62 }
67 clear_highpage(page); 63 clear_highpage(page);
68 } 64 }
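Both xtensa PTE allocators now rely on split_page() as well: allocate one page per cache color, split the block, keep the page whose address matches the color the faulting address needs, and free the rest (the old code also had to clear the compound flag and fix each refcount by hand, and dereferenced the wrong variable in places). A simplified sketch, assuming COLOR_ORDER/COLOR_SIZE/ADDR_COLOR are the arch's coloring macros with COLOR_SIZE == 1 << COLOR_ORDER:

#include <linux/mm.h>
#include <linux/gfp.h>

static unsigned long alloc_colored_page(int color)
{
	unsigned long p, keep = 0;
	int i;

	p = __get_free_pages(GFP_KERNEL, COLOR_ORDER);
	if (!p)
		return 0;

	split_page(virt_to_page(p), COLOR_ORDER);

	for (i = 0; i < COLOR_SIZE; i++, p += PAGE_SIZE) {
		if (!keep && ADDR_COLOR(p) == color)
			keep = p;	/* the page we will hand out */
		else
			free_page(p);	/* safe: split_page() gave it its own count */
	}
	return keep;
}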
diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h
index a9efc13cc858..8a98169b60c1 100644
--- a/drivers/char/snsc.h
+++ b/drivers/char/snsc.h
@@ -5,7 +5,7 @@
5 * License. See the file "COPYING" in the main directory of this archive 5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details. 6 * for more details.
7 * 7 *
8 * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11/* 11/*
@@ -70,6 +70,9 @@ struct sysctl_data_s {
70#define EV_CLASS_TEST_WARNING 0x6000ul 70#define EV_CLASS_TEST_WARNING 0x6000ul
71#define EV_CLASS_PWRD_NOTIFY 0x8000ul 71#define EV_CLASS_PWRD_NOTIFY 0x8000ul
72 72
73/* ENV class codes */
74#define ENV_PWRDN_PEND 0x4101ul
75
73#define EV_SEVERITY_POWER_STABLE 0x0000ul 76#define EV_SEVERITY_POWER_STABLE 0x0000ul
74#define EV_SEVERITY_POWER_LOW_WARNING 0x0100ul 77#define EV_SEVERITY_POWER_LOW_WARNING 0x0100ul
75#define EV_SEVERITY_POWER_HIGH_WARNING 0x0200ul 78#define EV_SEVERITY_POWER_HIGH_WARNING 0x0200ul
diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c
index baaa365285fa..a4fa507eed9e 100644
--- a/drivers/char/snsc_event.c
+++ b/drivers/char/snsc_event.c
@@ -5,7 +5,7 @@
5 * License. See the file "COPYING" in the main directory of this archive 5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details. 6 * for more details.
7 * 7 *
8 * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11/* 11/*
@@ -187,7 +187,8 @@ scdrv_event_severity(int code)
187static void 187static void
188scdrv_dispatch_event(char *event, int len) 188scdrv_dispatch_event(char *event, int len)
189{ 189{
190 int code, esp_code, src; 190 static int snsc_shutting_down = 0;
191 int code, esp_code, src, class;
191 char desc[CHUNKSIZE]; 192 char desc[CHUNKSIZE];
192 char *severity; 193 char *severity;
193 194
@@ -199,9 +200,25 @@ scdrv_dispatch_event(char *event, int len)
199 /* how urgent is the message? */ 200 /* how urgent is the message? */
200 severity = scdrv_event_severity(code); 201 severity = scdrv_event_severity(code);
201 202
202 if ((code & EV_CLASS_MASK) == EV_CLASS_PWRD_NOTIFY) { 203 class = (code & EV_CLASS_MASK);
204
205 if (class == EV_CLASS_PWRD_NOTIFY || code == ENV_PWRDN_PEND) {
203 struct task_struct *p; 206 struct task_struct *p;
204 207
208 if (snsc_shutting_down)
209 return;
210
211 snsc_shutting_down = 1;
212
213 /* give a message for each type of event */
214 if (class == EV_CLASS_PWRD_NOTIFY)
215 printk(KERN_NOTICE "Power off indication received."
216 " Sending SIGPWR to init...\n");
217 else if (code == ENV_PWRDN_PEND)
218 printk(KERN_CRIT "WARNING: Shutting down the system"
219 " due to a critical environmental condition."
220 " Sending SIGPWR to init...\n");
221
205 /* give a SIGPWR signal to init proc */ 222 /* give a SIGPWR signal to init proc */
206 223
207 /* first find init's task */ 224 /* first find init's task */
@@ -210,12 +227,11 @@ scdrv_dispatch_event(char *event, int len)
210 if (p->pid == 1) 227 if (p->pid == 1)
211 break; 228 break;
212 } 229 }
213 if (p) { /* we found init's task */ 230 if (p) {
214 printk(KERN_EMERG "Power off indication received. Initiating power fail sequence...\n");
215 force_sig(SIGPWR, p); 231 force_sig(SIGPWR, p);
216 } else { /* failed to find init's task - just give message(s) */ 232 } else {
217 printk(KERN_WARNING "Failed to find init proc to handle power off!\n"); 233 printk(KERN_ERR "Failed to signal init!\n");
218 printk("%s|$(0x%x)%s\n", severity, esp_code, desc); 234 snsc_shutting_down = 0; /* so can try again (?) */
219 } 235 }
220 read_unlock(&tasklist_lock); 236 read_unlock(&tasklist_lock);
221 } else { 237 } else {
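The reworked dispatcher treats the new ENV_PWRDN_PEND environmental code like a power-off notification, prints a class-specific message, and uses a one-shot snsc_shutting_down latch so a second event cannot signal init twice. A sketch of just the signalling path, written against the same 2.6-era task APIs the driver itself uses (the function name is a placeholder):

#include <linux/sched.h>
#include <linux/signal.h>

static int shutting_down;

static void signal_init_for_powerdown(void)
{
	struct task_struct *p, *initp = NULL;

	if (shutting_down)
		return;
	shutting_down = 1;

	read_lock(&tasklist_lock);
	for_each_process(p) {
		if (p->pid == 1) {	/* init */
			initp = p;
			break;
		}
	}
	if (initp)
		force_sig(SIGPWR, initp);
	else
		shutting_down = 0;	/* nothing signalled; allow a retry */
	read_unlock(&tasklist_lock);
}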
diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c
index ac2a297ce37c..a80c83210872 100644
--- a/drivers/char/tb0219.c
+++ b/drivers/char/tb0219.c
@@ -283,7 +283,7 @@ static void tb0219_pci_irq_init(void)
283 vr41xx_set_irq_level(TB0219_PCI_SLOT3_PIN, IRQ_LEVEL_LOW); 283 vr41xx_set_irq_level(TB0219_PCI_SLOT3_PIN, IRQ_LEVEL_LOW);
284} 284}
285 285
286static int tb0219_probe(struct platform_device *dev) 286static int __devinit tb0219_probe(struct platform_device *dev)
287{ 287{
288 int retval; 288 int retval;
289 289
@@ -319,7 +319,7 @@ static int tb0219_probe(struct platform_device *dev)
319 return 0; 319 return 0;
320} 320}
321 321
322static int tb0219_remove(struct platform_device *dev) 322static int __devexit tb0219_remove(struct platform_device *dev)
323{ 323{
324 _machine_restart = old_machine_restart; 324 _machine_restart = old_machine_restart;
325 325
@@ -335,19 +335,26 @@ static struct platform_device *tb0219_platform_device;
335 335
336static struct platform_driver tb0219_device_driver = { 336static struct platform_driver tb0219_device_driver = {
337 .probe = tb0219_probe, 337 .probe = tb0219_probe,
338 .remove = tb0219_remove, 338 .remove = __devexit_p(tb0219_remove),
339 .driver = { 339 .driver = {
340 .name = "TB0219", 340 .name = "TB0219",
341 .owner = THIS_MODULE,
341 }, 342 },
342}; 343};
343 344
344static int __devinit tanbac_tb0219_init(void) 345static int __init tanbac_tb0219_init(void)
345{ 346{
346 int retval; 347 int retval;
347 348
348 tb0219_platform_device = platform_device_register_simple("TB0219", -1, NULL, 0); 349 tb0219_platform_device = platform_device_alloc("TB0219", -1);
349 if (IS_ERR(tb0219_platform_device)) 350 if (!tb0219_platform_device)
350 return PTR_ERR(tb0219_platform_device); 351 return -ENOMEM;
352
353 retval = platform_device_add(tb0219_platform_device);
354 if (retval < 0) {
355 platform_device_put(tb0219_platform_device);
356 return retval;
357 }
351 358
352 retval = platform_driver_register(&tb0219_device_driver); 359 retval = platform_driver_register(&tb0219_device_driver);
353 if (retval < 0) 360 if (retval < 0)
@@ -356,10 +363,9 @@ static int __devinit tanbac_tb0219_init(void)
356 return retval; 363 return retval;
357} 364}
358 365
359static void __devexit tanbac_tb0219_exit(void) 366static void __exit tanbac_tb0219_exit(void)
360{ 367{
361 platform_driver_unregister(&tb0219_device_driver); 368 platform_driver_unregister(&tb0219_device_driver);
362
363 platform_device_unregister(tb0219_platform_device); 369 platform_device_unregister(tb0219_platform_device);
364} 370}
365 371
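This driver and vr41xx_giu, vr41xx_rtc, mv64x60_wdt and dcdbas below all switch from platform_device_register_simple() to the two-step platform_device_alloc() + platform_device_add(), so a failed add can be unwound with platform_device_put(); the probe/remove methods also pick up __devinit/__devexit markings and the drivers set .owner. A generic sketch of the new registration pattern ("example" is a placeholder, not one of these drivers):

#include <linux/module.h>
#include <linux/init.h>
#include <linux/platform_device.h>

static struct platform_device *example_pdev;

static int __init example_init(void)
{
	int ret;

	example_pdev = platform_device_alloc("example", -1);
	if (!example_pdev)
		return -ENOMEM;

	ret = platform_device_add(example_pdev);
	if (ret) {
		platform_device_put(example_pdev);	/* undoes the alloc */
		return ret;
	}
	return 0;
}

static void __exit example_exit(void)
{
	platform_device_unregister(example_pdev);	/* del + put */
}

module_init(example_init);
module_exit(example_exit);

vr41xx_rtc additionally calls platform_device_add_resources() between the alloc and the add, attaching its register window before the device becomes visible.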
diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c
index 2267c7b81799..05e6e814d86f 100644
--- a/drivers/char/vr41xx_giu.c
+++ b/drivers/char/vr41xx_giu.c
@@ -613,7 +613,7 @@ static struct file_operations gpio_fops = {
613 .release = gpio_release, 613 .release = gpio_release,
614}; 614};
615 615
616static int giu_probe(struct platform_device *dev) 616static int __devinit giu_probe(struct platform_device *dev)
617{ 617{
618 unsigned long start, size, flags = 0; 618 unsigned long start, size, flags = 0;
619 unsigned int nr_pins = 0; 619 unsigned int nr_pins = 0;
@@ -697,7 +697,7 @@ static int giu_probe(struct platform_device *dev)
697 return cascade_irq(GIUINT_IRQ, giu_get_irq); 697 return cascade_irq(GIUINT_IRQ, giu_get_irq);
698} 698}
699 699
700static int giu_remove(struct platform_device *dev) 700static int __devexit giu_remove(struct platform_device *dev)
701{ 701{
702 iounmap(giu_base); 702 iounmap(giu_base);
703 703
@@ -712,9 +712,10 @@ static struct platform_device *giu_platform_device;
712 712
713static struct platform_driver giu_device_driver = { 713static struct platform_driver giu_device_driver = {
714 .probe = giu_probe, 714 .probe = giu_probe,
715 .remove = giu_remove, 715 .remove = __devexit_p(giu_remove),
716 .driver = { 716 .driver = {
717 .name = "GIU", 717 .name = "GIU",
718 .owner = THIS_MODULE,
718 }, 719 },
719}; 720};
720 721
@@ -722,9 +723,15 @@ static int __init vr41xx_giu_init(void)
722{ 723{
723 int retval; 724 int retval;
724 725
725 giu_platform_device = platform_device_register_simple("GIU", -1, NULL, 0); 726 giu_platform_device = platform_device_alloc("GIU", -1);
726 if (IS_ERR(giu_platform_device)) 727 if (!giu_platform_device)
727 return PTR_ERR(giu_platform_device); 728 return -ENOMEM;
729
730 retval = platform_device_add(giu_platform_device);
731 if (retval < 0) {
732 platform_device_put(giu_platform_device);
733 return retval;
734 }
728 735
729 retval = platform_driver_register(&giu_device_driver); 736 retval = platform_driver_register(&giu_device_driver);
730 if (retval < 0) 737 if (retval < 0)
diff --git a/drivers/char/vr41xx_rtc.c b/drivers/char/vr41xx_rtc.c
index bc1b4a15212c..b109d9a502d6 100644
--- a/drivers/char/vr41xx_rtc.c
+++ b/drivers/char/vr41xx_rtc.c
@@ -558,7 +558,7 @@ static struct miscdevice rtc_miscdevice = {
558 .fops = &rtc_fops, 558 .fops = &rtc_fops,
559}; 559};
560 560
561static int rtc_probe(struct platform_device *pdev) 561static int __devinit rtc_probe(struct platform_device *pdev)
562{ 562{
563 unsigned int irq; 563 unsigned int irq;
564 int retval; 564 int retval;
@@ -631,7 +631,7 @@ static int rtc_probe(struct platform_device *pdev)
631 return 0; 631 return 0;
632} 632}
633 633
634static int rtc_remove(struct platform_device *dev) 634static int __devexit rtc_remove(struct platform_device *dev)
635{ 635{
636 int retval; 636 int retval;
637 637
@@ -653,13 +653,14 @@ static struct platform_device *rtc_platform_device;
653 653
654static struct platform_driver rtc_device_driver = { 654static struct platform_driver rtc_device_driver = {
655 .probe = rtc_probe, 655 .probe = rtc_probe,
656 .remove = rtc_remove, 656 .remove = __devexit_p(rtc_remove),
657 .driver = { 657 .driver = {
658 .name = rtc_name, 658 .name = rtc_name,
659 .owner = THIS_MODULE,
659 }, 660 },
660}; 661};
661 662
662static int __devinit vr41xx_rtc_init(void) 663static int __init vr41xx_rtc_init(void)
663{ 664{
664 int retval; 665 int retval;
665 666
@@ -684,10 +685,20 @@ static int __devinit vr41xx_rtc_init(void)
684 break; 685 break;
685 } 686 }
686 687
687 rtc_platform_device = platform_device_register_simple("RTC", -1, 688 rtc_platform_device = platform_device_alloc("RTC", -1);
688 rtc_resource, ARRAY_SIZE(rtc_resource)); 689 if (!rtc_platform_device)
689 if (IS_ERR(rtc_platform_device)) 690 return -ENOMEM;
690 return PTR_ERR(rtc_platform_device); 691
692 retval = platform_device_add_resources(rtc_platform_device,
693 rtc_resource, ARRAY_SIZE(rtc_resource));
694
695 if (retval == 0)
696 retval = platform_device_add(rtc_platform_device);
697
698 if (retval < 0) {
699 platform_device_put(rtc_platform_device);
700 return retval;
701 }
691 702
692 retval = platform_driver_register(&rtc_device_driver); 703 retval = platform_driver_register(&rtc_device_driver);
693 if (retval < 0) 704 if (retval < 0)
@@ -696,10 +707,9 @@ static int __devinit vr41xx_rtc_init(void)
696 return retval; 707 return retval;
697} 708}
698 709
699static void __devexit vr41xx_rtc_exit(void) 710static void __exit vr41xx_rtc_exit(void)
700{ 711{
701 platform_driver_unregister(&rtc_device_driver); 712 platform_driver_unregister(&rtc_device_driver);
702
703 platform_device_unregister(rtc_platform_device); 713 platform_device_unregister(rtc_platform_device);
704} 714}
705 715
diff --git a/drivers/char/watchdog/mv64x60_wdt.c b/drivers/char/watchdog/mv64x60_wdt.c
index 00d9ef04a369..f1b9cf89f153 100644
--- a/drivers/char/watchdog/mv64x60_wdt.c
+++ b/drivers/char/watchdog/mv64x60_wdt.c
@@ -228,15 +228,25 @@ static int __init mv64x60_wdt_init(void)
228 228
229 printk(KERN_INFO "MV64x60 watchdog driver\n"); 229 printk(KERN_INFO "MV64x60 watchdog driver\n");
230 230
231 mv64x60_wdt_dev = platform_device_register_simple(MV64x60_WDT_NAME, 231 mv64x60_wdt_dev = platform_device_alloc(MV64x60_WDT_NAME, -1);
232 -1, NULL, 0); 232 if (!mv64x60_wdt_dev) {
233 if (IS_ERR(mv64x60_wdt_dev)) { 233 ret = -ENOMEM;
234 ret = PTR_ERR(mv64x60_wdt_dev); 234 goto out;
235 }
236
237 ret = platform_device_add(mv64x60_wdt_dev);
238 if (ret) {
239 platform_device_put(mv64x60_wdt_dev);
235 goto out; 240 goto out;
236 } 241 }
237 242
238 ret = platform_driver_register(&mv64x60_wdt_driver); 243 ret = platform_driver_register(&mv64x60_wdt_driver);
239 out: 244 if (ret) {
245 platform_device_unregister(mv64x60_wdt_dev);
246 goto out;
247 }
248
249 out:
240 return ret; 250 return ret;
241} 251}
242 252
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 4652512f7d1a..3a4e5c5b4e1f 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -530,30 +530,27 @@ static DCDBAS_DEV_ATTR_RW(host_control_action);
530static DCDBAS_DEV_ATTR_RW(host_control_smi_type); 530static DCDBAS_DEV_ATTR_RW(host_control_smi_type);
531static DCDBAS_DEV_ATTR_RW(host_control_on_shutdown); 531static DCDBAS_DEV_ATTR_RW(host_control_on_shutdown);
532 532
533static struct device_attribute *dcdbas_dev_attrs[] = { 533static struct attribute *dcdbas_dev_attrs[] = {
534 &dev_attr_smi_data_buf_size, 534 &dev_attr_smi_data_buf_size.attr,
535 &dev_attr_smi_data_buf_phys_addr, 535 &dev_attr_smi_data_buf_phys_addr.attr,
536 &dev_attr_smi_request, 536 &dev_attr_smi_request.attr,
537 &dev_attr_host_control_action, 537 &dev_attr_host_control_action.attr,
538 &dev_attr_host_control_smi_type, 538 &dev_attr_host_control_smi_type.attr,
539 &dev_attr_host_control_on_shutdown, 539 &dev_attr_host_control_on_shutdown.attr,
540 NULL 540 NULL
541}; 541};
542 542
543/** 543static struct attribute_group dcdbas_attr_group = {
544 * dcdbas_init: initialize driver 544 .attrs = dcdbas_dev_attrs,
545 */ 545};
546static int __init dcdbas_init(void) 546
547static int __devinit dcdbas_probe(struct platform_device *dev)
547{ 548{
548 int i; 549 int i, error;
549 550
550 host_control_action = HC_ACTION_NONE; 551 host_control_action = HC_ACTION_NONE;
551 host_control_smi_type = HC_SMITYPE_NONE; 552 host_control_smi_type = HC_SMITYPE_NONE;
552 553
553 dcdbas_pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0);
554 if (IS_ERR(dcdbas_pdev))
555 return PTR_ERR(dcdbas_pdev);
556
557 /* 554 /*
558 * BIOS SMI calls require buffer addresses be in 32-bit address space. 555 * BIOS SMI calls require buffer addresses be in 32-bit address space.
559 * This is done by setting the DMA mask below. 556 * This is done by setting the DMA mask below.
@@ -561,19 +558,79 @@ static int __init dcdbas_init(void)
561 dcdbas_pdev->dev.coherent_dma_mask = DMA_32BIT_MASK; 558 dcdbas_pdev->dev.coherent_dma_mask = DMA_32BIT_MASK;
562 dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask; 559 dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask;
563 560
561 error = sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group);
562 if (error)
563 return error;
564
565 for (i = 0; dcdbas_bin_attrs[i]; i++) {
566 error = sysfs_create_bin_file(&dev->dev.kobj,
567 dcdbas_bin_attrs[i]);
568 if (error) {
569 while (--i >= 0)
570 sysfs_remove_bin_file(&dev->dev.kobj,
571 dcdbas_bin_attrs[i]);
 572 sysfs_remove_group(&dev->dev.kobj, &dcdbas_attr_group);
573 return error;
574 }
575 }
576
564 register_reboot_notifier(&dcdbas_reboot_nb); 577 register_reboot_notifier(&dcdbas_reboot_nb);
565 578
579 dev_info(&dev->dev, "%s (version %s)\n",
580 DRIVER_DESCRIPTION, DRIVER_VERSION);
581
582 return 0;
583}
584
585static int __devexit dcdbas_remove(struct platform_device *dev)
586{
587 int i;
588
589 unregister_reboot_notifier(&dcdbas_reboot_nb);
566 for (i = 0; dcdbas_bin_attrs[i]; i++) 590 for (i = 0; dcdbas_bin_attrs[i]; i++)
567 sysfs_create_bin_file(&dcdbas_pdev->dev.kobj, 591 sysfs_remove_bin_file(&dev->dev.kobj, dcdbas_bin_attrs[i]);
568 dcdbas_bin_attrs[i]); 592 sysfs_remove_group(&dev->dev.kobj, &dcdbas_attr_group);
569 593
570 for (i = 0; dcdbas_dev_attrs[i]; i++) 594 return 0;
571 device_create_file(&dcdbas_pdev->dev, dcdbas_dev_attrs[i]); 595}
572 596
573 dev_info(&dcdbas_pdev->dev, "%s (version %s)\n", 597static struct platform_driver dcdbas_driver = {
574 DRIVER_DESCRIPTION, DRIVER_VERSION); 598 .driver = {
599 .name = DRIVER_NAME,
600 .owner = THIS_MODULE,
601 },
602 .probe = dcdbas_probe,
603 .remove = __devexit_p(dcdbas_remove),
604};
605
606/**
607 * dcdbas_init: initialize driver
608 */
609static int __init dcdbas_init(void)
610{
611 int error;
612
613 error = platform_driver_register(&dcdbas_driver);
614 if (error)
615 return error;
616
617 dcdbas_pdev = platform_device_alloc(DRIVER_NAME, -1);
618 if (!dcdbas_pdev) {
619 error = -ENOMEM;
620 goto err_unregister_driver;
621 }
622
623 error = platform_device_add(dcdbas_pdev);
624 if (error)
625 goto err_free_device;
575 626
576 return 0; 627 return 0;
628
629 err_free_device:
630 platform_device_put(dcdbas_pdev);
631 err_unregister_driver:
632 platform_driver_unregister(&dcdbas_driver);
633 return error;
577} 634}
578 635
579/** 636/**
@@ -588,6 +645,15 @@ static void __exit dcdbas_exit(void)
588 unregister_reboot_notifier(&dcdbas_reboot_nb); 645 unregister_reboot_notifier(&dcdbas_reboot_nb);
589 smi_data_buf_free(); 646 smi_data_buf_free();
590 platform_device_unregister(dcdbas_pdev); 647 platform_device_unregister(dcdbas_pdev);
648 platform_driver_unregister(&dcdbas_driver);
649
650 /*
651 * We have to free the buffer here instead of dcdbas_remove
 652 * because only in the module exit function can we be sure that
653 * all sysfs attributes belonging to this module have been
654 * released.
655 */
656 smi_data_buf_free();
591} 657}
592 658
593module_init(dcdbas_init); 659module_init(dcdbas_init);
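Beyond becoming a platform driver with probe()/remove() hooks, dcdbas now publishes its device attributes as a single attribute_group, which gives the teardown path a matching sysfs_remove_group() instead of the old loop of device_create_file() calls with no counterpart. A generic sketch of that sysfs idiom (the example/foo names are placeholders, not the driver's attributes):

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/sysfs.h>
#include <linux/platform_device.h>

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sprintf(buf, "42\n");
}
static DEVICE_ATTR(foo, 0444, foo_show, NULL);

static struct attribute *example_attrs[] = {
	&dev_attr_foo.attr,
	NULL,
};

static struct attribute_group example_attr_group = {
	.attrs = example_attrs,
};

static int example_probe(struct platform_device *dev)
{
	return sysfs_create_group(&dev->dev.kobj, &example_attr_group);
}

static int example_remove(struct platform_device *dev)
{
	sysfs_remove_group(&dev->dev.kobj, &example_attr_group);
	return 0;
}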
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 88d60202b9db..26b08ee425c7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -533,30 +533,35 @@ static void __clone_and_map(struct clone_info *ci)
533 533
534 } else { 534 } else {
535 /* 535 /*
536 * Create two copy bios to deal with io that has 536 * Handle a bvec that must be split between two or more targets.
537 * been split across a target.
538 */ 537 */
539 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 538 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
539 sector_t remaining = to_sector(bv->bv_len);
540 unsigned int offset = 0;
540 541
541 clone = split_bvec(bio, ci->sector, ci->idx, 542 do {
542 bv->bv_offset, max); 543 if (offset) {
543 __map_bio(ti, clone, tio); 544 ti = dm_table_find_target(ci->map, ci->sector);
545 max = max_io_len(ci->md, ci->sector, ti);
544 546
545 ci->sector += max; 547 tio = alloc_tio(ci->md);
546 ci->sector_count -= max; 548 tio->io = ci->io;
547 ti = dm_table_find_target(ci->map, ci->sector); 549 tio->ti = ti;
548 550 memset(&tio->info, 0, sizeof(tio->info));
549 len = to_sector(bv->bv_len) - max; 551 }
550 clone = split_bvec(bio, ci->sector, ci->idx, 552
551 bv->bv_offset + to_bytes(max), len); 553 len = min(remaining, max);
552 tio = alloc_tio(ci->md); 554
553 tio->io = ci->io; 555 clone = split_bvec(bio, ci->sector, ci->idx,
554 tio->ti = ti; 556 bv->bv_offset + offset, len);
555 memset(&tio->info, 0, sizeof(tio->info)); 557
556 __map_bio(ti, clone, tio); 558 __map_bio(ti, clone, tio);
559
560 ci->sector += len;
561 ci->sector_count -= len;
562 offset += to_bytes(len);
563 } while (remaining -= len);
557 564
558 ci->sector += len;
559 ci->sector_count -= len;
560 ci->idx++; 565 ci->idx++;
561 } 566 }
562} 567}
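The old __clone_and_map() branch could split a bio_vec only once, into exactly two clones; the rewritten loop re-resolves the target and its max_io_len() for every piece, so one bvec may now straddle any number of targets. The arithmetic, lifted out of the dm plumbing (target_limit and emit_clone are hypothetical stand-ins for dm_table_find_target()+max_io_len() and split_bvec()+__map_bio()):

#include <linux/kernel.h>
#include <linux/types.h>

static void split_across_targets(sector_t sector, sector_t remaining,
				 sector_t (*target_limit)(sector_t),
				 void (*emit_clone)(sector_t start, sector_t len))
{
	sector_t len;

	do {
		len = min(remaining, target_limit(sector));	/* cap at target boundary */
		emit_clone(sector, len);
		sector += len;
	} while (remaining -= len);	/* stop once the bvec is consumed */
}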
diff --git a/drivers/media/dvb/bt8xx/Makefile b/drivers/media/dvb/bt8xx/Makefile
index 9d197efb481d..d188e4c670b5 100644
--- a/drivers/media/dvb/bt8xx/Makefile
+++ b/drivers/media/dvb/bt8xx/Makefile
@@ -1,3 +1,3 @@
1obj-$(CONFIG_DVB_BT8XX) += bt878.o dvb-bt8xx.o dst.o dst_ca.o 1obj-$(CONFIG_DVB_BT8XX) += bt878.o dvb-bt8xx.o dst.o dst_ca.o
2 2
3EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video/bt8xx -Idrivers/media/dvb/frontends 3EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video -Idrivers/media/dvb/frontends
diff --git a/drivers/net/mv643xx_eth.h b/drivers/net/mv643xx_eth.h
index 7754d1974b9e..4262c1da6d4a 100644
--- a/drivers/net/mv643xx_eth.h
+++ b/drivers/net/mv643xx_eth.h
@@ -42,13 +42,23 @@
42#define MAX_DESCS_PER_SKB 1 42#define MAX_DESCS_PER_SKB 1
43#endif 43#endif
44 44
45/*
46 * The MV643XX HW requires 8-byte alignment. However, when I/O
47 * is non-cache-coherent, we need to ensure that the I/O buffers
48 * we use don't share cache lines with other data.
49 */
50#if defined(CONFIG_DMA_NONCOHERENT) || defined(CONFIG_NOT_COHERENT_CACHE)
51#define ETH_DMA_ALIGN L1_CACHE_BYTES
52#else
53#define ETH_DMA_ALIGN 8
54#endif
55
45#define ETH_VLAN_HLEN 4 56#define ETH_VLAN_HLEN 4
46#define ETH_FCS_LEN 4 57#define ETH_FCS_LEN 4
47#define ETH_DMA_ALIGN 8 /* hw requires 8-byte alignment */ 58#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */
48#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */
49#define ETH_WRAPPER_LEN (ETH_HW_IP_ALIGN + ETH_HLEN + \ 59#define ETH_WRAPPER_LEN (ETH_HW_IP_ALIGN + ETH_HLEN + \
50 ETH_VLAN_HLEN + ETH_FCS_LEN) 60 ETH_VLAN_HLEN + ETH_FCS_LEN)
51#define ETH_RX_SKB_SIZE ((dev->mtu + ETH_WRAPPER_LEN + 7) & ~0x7) 61#define ETH_RX_SKB_SIZE (dev->mtu + ETH_WRAPPER_LEN + ETH_DMA_ALIGN)
52 62
53#define ETH_RX_QUEUES_ENABLED (1 << 0) /* use only Q0 for receive */ 63#define ETH_RX_QUEUES_ENABLED (1 << 0) /* use only Q0 for receive */
54#define ETH_TX_QUEUES_ENABLED (1 << 0) /* use only Q0 for transmit */ 64#define ETH_TX_QUEUES_ENABLED (1 << 0) /* use only Q0 for transmit */
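ETH_DMA_ALIGN grows to a full cache line when DMA is not cache-coherent, so a receive buffer never shares a line with unrelated data, and ETH_RX_SKB_SIZE now over-allocates by that alignment instead of rounding to 8 bytes. A sketch of the usual way such a constant is consumed when refilling the RX ring (the typical idiom, not necessarily this driver's exact code):

#include <linux/skbuff.h>

static struct sk_buff *alloc_aligned_rx_skb(unsigned int size, unsigned int align)
{
	struct sk_buff *skb = dev_alloc_skb(size + align);
	unsigned long off;

	if (!skb)
		return NULL;

	off = (unsigned long)skb->data & (align - 1);
	if (off)
		skb_reserve(skb, align - off);	/* push data up to the boundary */
	return skb;
}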
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index 7e900572eaf8..9595f74da93f 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -22,12 +22,12 @@
22 *************************************************************************/ 22 *************************************************************************/
23 23
24#define DRV_NAME "pcnet32" 24#define DRV_NAME "pcnet32"
25#define DRV_VERSION "1.31c" 25#define DRV_VERSION "1.32"
26#define DRV_RELDATE "01.Nov.2005" 26#define DRV_RELDATE "18.Mar.2006"
27#define PFX DRV_NAME ": " 27#define PFX DRV_NAME ": "
28 28
29static const char * const version = 29static const char *const version =
30DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; 30 DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n";
31 31
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/kernel.h> 33#include <linux/kernel.h>
@@ -58,18 +58,23 @@ DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n";
58 * PCI device identifiers for "new style" Linux PCI Device Drivers 58 * PCI device identifiers for "new style" Linux PCI Device Drivers
59 */ 59 */
60static struct pci_device_id pcnet32_pci_tbl[] = { 60static struct pci_device_id pcnet32_pci_tbl[] = {
61 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, 61 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME,
62 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, 62 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
63 /* 63 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE,
64 * Adapters that were sold with IBM's RS/6000 or pSeries hardware have 64 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
65 * the incorrect vendor id. 65
66 */ 66 /*
67 { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, 67 * Adapters that were sold with IBM's RS/6000 or pSeries hardware have
68 PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0 }, 68 * the incorrect vendor id.
69 { 0, } 69 */
70 { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE,
71 PCI_ANY_ID, PCI_ANY_ID,
72 PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0},
73
74 { } /* terminate list */
70}; 75};
71 76
72MODULE_DEVICE_TABLE (pci, pcnet32_pci_tbl); 77MODULE_DEVICE_TABLE(pci, pcnet32_pci_tbl);
73 78
74static int cards_found; 79static int cards_found;
75 80
@@ -77,13 +82,11 @@ static int cards_found;
77 * VLB I/O addresses 82 * VLB I/O addresses
78 */ 83 */
79static unsigned int pcnet32_portlist[] __initdata = 84static unsigned int pcnet32_portlist[] __initdata =
80 { 0x300, 0x320, 0x340, 0x360, 0 }; 85 { 0x300, 0x320, 0x340, 0x360, 0 };
81
82
83 86
84static int pcnet32_debug = 0; 87static int pcnet32_debug = 0;
85static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */ 88static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */
86static int pcnet32vlb; /* check for VLB cards ? */ 89static int pcnet32vlb; /* check for VLB cards ? */
87 90
88static struct net_device *pcnet32_dev; 91static struct net_device *pcnet32_dev;
89 92
@@ -110,32 +113,34 @@ static int rx_copybreak = 200;
110 * to internal options 113 * to internal options
111 */ 114 */
112static const unsigned char options_mapping[] = { 115static const unsigned char options_mapping[] = {
113 PCNET32_PORT_ASEL, /* 0 Auto-select */ 116 PCNET32_PORT_ASEL, /* 0 Auto-select */
114 PCNET32_PORT_AUI, /* 1 BNC/AUI */ 117 PCNET32_PORT_AUI, /* 1 BNC/AUI */
115 PCNET32_PORT_AUI, /* 2 AUI/BNC */ 118 PCNET32_PORT_AUI, /* 2 AUI/BNC */
116 PCNET32_PORT_ASEL, /* 3 not supported */ 119 PCNET32_PORT_ASEL, /* 3 not supported */
117 PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */ 120 PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */
118 PCNET32_PORT_ASEL, /* 5 not supported */ 121 PCNET32_PORT_ASEL, /* 5 not supported */
119 PCNET32_PORT_ASEL, /* 6 not supported */ 122 PCNET32_PORT_ASEL, /* 6 not supported */
120 PCNET32_PORT_ASEL, /* 7 not supported */ 123 PCNET32_PORT_ASEL, /* 7 not supported */
121 PCNET32_PORT_ASEL, /* 8 not supported */ 124 PCNET32_PORT_ASEL, /* 8 not supported */
122 PCNET32_PORT_MII, /* 9 MII 10baseT */ 125 PCNET32_PORT_MII, /* 9 MII 10baseT */
123 PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */ 126 PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */
124 PCNET32_PORT_MII, /* 11 MII (autosel) */ 127 PCNET32_PORT_MII, /* 11 MII (autosel) */
125 PCNET32_PORT_10BT, /* 12 10BaseT */ 128 PCNET32_PORT_10BT, /* 12 10BaseT */
126 PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */ 129 PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */
127 PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD, /* 14 MII 100BaseTx-FD */ 130 /* 14 MII 100BaseTx-FD */
128 PCNET32_PORT_ASEL /* 15 not supported */ 131 PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD,
132 PCNET32_PORT_ASEL /* 15 not supported */
129}; 133};
130 134
131static const char pcnet32_gstrings_test[][ETH_GSTRING_LEN] = { 135static const char pcnet32_gstrings_test[][ETH_GSTRING_LEN] = {
132 "Loopback test (offline)" 136 "Loopback test (offline)"
133}; 137};
138
134#define PCNET32_TEST_LEN (sizeof(pcnet32_gstrings_test) / ETH_GSTRING_LEN) 139#define PCNET32_TEST_LEN (sizeof(pcnet32_gstrings_test) / ETH_GSTRING_LEN)
135 140
136#define PCNET32_NUM_REGS 168 141#define PCNET32_NUM_REGS 136
137 142
138#define MAX_UNITS 8 /* More are supported, limit only on options */ 143#define MAX_UNITS 8 /* More are supported, limit only on options */
139static int options[MAX_UNITS]; 144static int options[MAX_UNITS];
140static int full_duplex[MAX_UNITS]; 145static int full_duplex[MAX_UNITS];
141static int homepna[MAX_UNITS]; 146static int homepna[MAX_UNITS];
@@ -151,124 +156,6 @@ static int homepna[MAX_UNITS];
151 */ 156 */
152 157
153/* 158/*
154 * History:
155 * v0.01: Initial version
156 * only tested on Alpha Noname Board
157 * v0.02: changed IRQ handling for new interrupt scheme (dev_id)
158 * tested on a ASUS SP3G
159 * v0.10: fixed an odd problem with the 79C974 in a Compaq Deskpro XL
160 * looks like the 974 doesn't like stopping and restarting in a
161 * short period of time; now we do a reinit of the lance; the
162 * bug was triggered by doing ifconfig eth0 <ip> broadcast <addr>
163 * and hangs the machine (thanks to Klaus Liedl for debugging)
164 * v0.12: by suggestion from Donald Becker: Renamed driver to pcnet32,
165 * made it standalone (no need for lance.c)
166 * v0.13: added additional PCI detecting for special PCI devices (Compaq)
167 * v0.14: stripped down additional PCI probe (thanks to David C Niemi
168 * and sveneric@xs4all.nl for testing this on their Compaq boxes)
169 * v0.15: added 79C965 (VLB) probe
170 * added interrupt sharing for PCI chips
171 * v0.16: fixed set_multicast_list on Alpha machines
172 * v0.17: removed hack from dev.c; now pcnet32 uses ethif_probe in Space.c
173 * v0.19: changed setting of autoselect bit
174 * v0.20: removed additional Compaq PCI probe; there is now a working one
175 * in arch/i386/bios32.c
176 * v0.21: added endian conversion for ppc, from work by cort@cs.nmt.edu
177 * v0.22: added printing of status to ring dump
178 * v0.23: changed enet_statistics to net_devive_stats
179 * v0.90: added multicast filter
180 * added module support
181 * changed irq probe to new style
182 * added PCnetFast chip id
183 * added fix for receive stalls with Intel saturn chipsets
184 * added in-place rx skbs like in the tulip driver
185 * minor cleanups
186 * v0.91: added PCnetFast+ chip id
187 * back port to 2.0.x
188 * v1.00: added some stuff from Donald Becker's 2.0.34 version
189 * added support for byte counters in net_dev_stats
190 * v1.01: do ring dumps, only when debugging the driver
191 * increased the transmit timeout
192 * v1.02: fixed memory leak in pcnet32_init_ring()
193 * v1.10: workaround for stopped transmitter
194 * added port selection for modules
195 * detect special T1/E1 WAN card and setup port selection
196 * v1.11: fixed wrong checking of Tx errors
197 * v1.20: added check of return value kmalloc (cpeterso@cs.washington.edu)
198 * added save original kmalloc addr for freeing (mcr@solidum.com)
199 * added support for PCnetHome chip (joe@MIT.EDU)
200 * rewritten PCI card detection
201 * added dwio mode to get driver working on some PPC machines
202 * v1.21: added mii selection and mii ioctl
203 * v1.22: changed pci scanning code to make PPC people happy
204 * fixed switching to 32bit mode in pcnet32_open() (thanks
205 * to Michael Richard <mcr@solidum.com> for noticing this one)
206 * added sub vendor/device id matching (thanks again to
207 * Michael Richard <mcr@solidum.com>)
208 * added chip id for 79c973/975 (thanks to Zach Brown <zab@zabbo.net>)
209 * v1.23 fixed small bug, when manual selecting MII speed/duplex
210 * v1.24 Applied Thomas' patch to use TxStartPoint and thus decrease TxFIFO
211 * underflows. Added tx_start_pt module parameter. Increased
212 * TX_RING_SIZE from 16 to 32. Added #ifdef'd code to use DXSUFLO
213 * for FAST[+] chipsets. <kaf@fc.hp.com>
214 * v1.24ac Added SMP spinlocking - Alan Cox <alan@redhat.com>
215 * v1.25kf Added No Interrupt on successful Tx for some Tx's <kaf@fc.hp.com>
216 * v1.26 Converted to pci_alloc_consistent, Jamey Hicks / George France
217 * <jamey@crl.dec.com>
218 * - Fixed a few bugs, related to running the controller in 32bit mode.
219 * 23 Oct, 2000. Carsten Langgaard, carstenl@mips.com
220 * Copyright (C) 2000 MIPS Technologies, Inc. All rights reserved.
221 * v1.26p Fix oops on rmmod+insmod; plug i/o resource leak - Paul Gortmaker
222 * v1.27 improved CSR/PROM address detection, lots of cleanups,
223 * new pcnet32vlb module option, HP-PARISC support,
224 * added module parameter descriptions,
225 * initial ethtool support - Helge Deller <deller@gmx.de>
226 * v1.27a Sun Feb 10 2002 Go Taniguchi <go@turbolinux.co.jp>
227 * use alloc_etherdev and register_netdev
228 * fix pci probe not increment cards_found
229 * FD auto negotiate error workaround for xSeries250
230 * clean up and using new mii module
231 * v1.27b Sep 30 2002 Kent Yoder <yoder1@us.ibm.com>
232 * Added timer for cable connection state changes.
233 * v1.28 20 Feb 2004 Don Fry <brazilnut@us.ibm.com>
234 * Jon Mason <jonmason@us.ibm.com>, Chinmay Albal <albal@in.ibm.com>
235 * Now uses ethtool_ops, netif_msg_* and generic_mii_ioctl.
236 * Fixes bogus 'Bus master arbitration failure', pci_[un]map_single
237 * length errors, and transmit hangs. Cleans up after errors in open.
238 * Jim Lewis <jklewis@us.ibm.com> added ethernet loopback test.
239 * Thomas Munck Steenholdt <tmus@tmus.dk> non-mii ioctl corrections.
240 * v1.29 6 Apr 2004 Jim Lewis <jklewis@us.ibm.com> added physical
241 * identification code (blink led's) and register dump.
242 * Don Fry added timer for 971/972 so skbufs don't remain on tx ring
243 * forever.
244 * v1.30 18 May 2004 Don Fry removed timer and Last Transmit Interrupt
245 * (ltint) as they added complexity and didn't give good throughput.
246 * v1.30a 22 May 2004 Don Fry limit frames received during interrupt.
247 * v1.30b 24 May 2004 Don Fry fix bogus tx carrier errors with 79c973,
248 * assisted by Bruce Penrod <bmpenrod@endruntechnologies.com>.
249 * v1.30c 25 May 2004 Don Fry added netif_wake_queue after pcnet32_restart.
250 * v1.30d 01 Jun 2004 Don Fry discard oversize rx packets.
251 * v1.30e 11 Jun 2004 Don Fry recover after fifo error and rx hang.
252 * v1.30f 16 Jun 2004 Don Fry cleanup IRQ to allow 0 and 1 for PCI,
253 * expanding on suggestions from Ralf Baechle <ralf@linux-mips.org>,
254 * and Brian Murphy <brian@murphy.dk>.
255 * v1.30g 22 Jun 2004 Patrick Simmons <psimmons@flash.net> added option
256 * homepna for selecting HomePNA mode for PCNet/Home 79C978.
257 * v1.30h 24 Jun 2004 Don Fry correctly select auto, speed, duplex in bcr32.
258 * v1.30i 28 Jun 2004 Don Fry change to use module_param.
259 * v1.30j 29 Apr 2005 Don Fry fix skb/map leak with loopback test.
260 * v1.31 02 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> added set_ringparam().
261 * v1.31a 12 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> set min ring size to 4
262 * to allow loopback test to work unchanged.
263 * v1.31b 06 Oct 2005 Don Fry changed alloc_ring to show name of device
264 * if allocation fails
265 * v1.31c 01 Nov 2005 Don Fry Allied Telesyn 2700/2701 FX are 100Mbit only.
266 * Force 100Mbit FD if Auto (ASEL) is selected.
267 * See Bugzilla 2669 and 4551.
268 */
269
270
271/*
272 * Set the number of Tx and Rx buffers, using Log_2(# buffers). 159 * Set the number of Tx and Rx buffers, using Log_2(# buffers).
273 * Reasonable default values are 4 Tx buffers, and 16 Rx buffers. 160 * Reasonable default values are 4 Tx buffers, and 16 Rx buffers.
274 * That translates to 2 (4 == 2^^2) and 4 (16 == 2^^4). 161 * That translates to 2 (4 == 2^^2) and 4 (16 == 2^^4).
@@ -303,42 +190,42 @@ static int homepna[MAX_UNITS];
303 190
304/* The PCNET32 Rx and Tx ring descriptors. */ 191/* The PCNET32 Rx and Tx ring descriptors. */
305struct pcnet32_rx_head { 192struct pcnet32_rx_head {
306 u32 base; 193 u32 base;
307 s16 buf_length; 194 s16 buf_length;
308 s16 status; 195 s16 status;
309 u32 msg_length; 196 u32 msg_length;
310 u32 reserved; 197 u32 reserved;
311}; 198};
312 199
313struct pcnet32_tx_head { 200struct pcnet32_tx_head {
314 u32 base; 201 u32 base;
315 s16 length; 202 s16 length;
316 s16 status; 203 s16 status;
317 u32 misc; 204 u32 misc;
318 u32 reserved; 205 u32 reserved;
319}; 206};
320 207
321/* The PCNET32 32-Bit initialization block, described in databook. */ 208/* The PCNET32 32-Bit initialization block, described in databook. */
322struct pcnet32_init_block { 209struct pcnet32_init_block {
323 u16 mode; 210 u16 mode;
324 u16 tlen_rlen; 211 u16 tlen_rlen;
325 u8 phys_addr[6]; 212 u8 phys_addr[6];
326 u16 reserved; 213 u16 reserved;
327 u32 filter[2]; 214 u32 filter[2];
328 /* Receive and transmit ring base, along with extra bits. */ 215 /* Receive and transmit ring base, along with extra bits. */
329 u32 rx_ring; 216 u32 rx_ring;
330 u32 tx_ring; 217 u32 tx_ring;
331}; 218};
332 219
333/* PCnet32 access functions */ 220/* PCnet32 access functions */
334struct pcnet32_access { 221struct pcnet32_access {
335 u16 (*read_csr)(unsigned long, int); 222 u16 (*read_csr) (unsigned long, int);
336 void (*write_csr)(unsigned long, int, u16); 223 void (*write_csr) (unsigned long, int, u16);
337 u16 (*read_bcr)(unsigned long, int); 224 u16 (*read_bcr) (unsigned long, int);
338 void (*write_bcr)(unsigned long, int, u16); 225 void (*write_bcr) (unsigned long, int, u16);
339 u16 (*read_rap)(unsigned long); 226 u16 (*read_rap) (unsigned long);
340 void (*write_rap)(unsigned long, u16); 227 void (*write_rap) (unsigned long, u16);
341 void (*reset)(unsigned long); 228 void (*reset) (unsigned long);
342}; 229};
343 230
344/* 231/*
@@ -346,760 +233,794 @@ struct pcnet32_access {
346 * so the structure should be allocated using pci_alloc_consistent(). 233 * so the structure should be allocated using pci_alloc_consistent().
347 */ 234 */
348struct pcnet32_private { 235struct pcnet32_private {
349 struct pcnet32_init_block init_block; 236 struct pcnet32_init_block init_block;
350 /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ 237 /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */
351 struct pcnet32_rx_head *rx_ring; 238 struct pcnet32_rx_head *rx_ring;
352 struct pcnet32_tx_head *tx_ring; 239 struct pcnet32_tx_head *tx_ring;
353 dma_addr_t dma_addr; /* DMA address of beginning of this 240 dma_addr_t dma_addr;/* DMA address of beginning of this
354 object, returned by 241 object, returned by pci_alloc_consistent */
355 pci_alloc_consistent */ 242 struct pci_dev *pci_dev;
356 struct pci_dev *pci_dev; /* Pointer to the associated pci device 243 const char *name;
357 structure */ 244 /* The saved address of a sent-in-place packet/buffer, for skfree(). */
358 const char *name; 245 struct sk_buff **tx_skbuff;
359 /* The saved address of a sent-in-place packet/buffer, for skfree(). */ 246 struct sk_buff **rx_skbuff;
360 struct sk_buff **tx_skbuff; 247 dma_addr_t *tx_dma_addr;
361 struct sk_buff **rx_skbuff; 248 dma_addr_t *rx_dma_addr;
362 dma_addr_t *tx_dma_addr; 249 struct pcnet32_access a;
363 dma_addr_t *rx_dma_addr; 250 spinlock_t lock; /* Guard lock */
364 struct pcnet32_access a; 251 unsigned int cur_rx, cur_tx; /* The next free ring entry */
365 spinlock_t lock; /* Guard lock */ 252 unsigned int rx_ring_size; /* current rx ring size */
366 unsigned int cur_rx, cur_tx; /* The next free ring entry */ 253 unsigned int tx_ring_size; /* current tx ring size */
367 unsigned int rx_ring_size; /* current rx ring size */ 254 unsigned int rx_mod_mask; /* rx ring modular mask */
368 unsigned int tx_ring_size; /* current tx ring size */ 255 unsigned int tx_mod_mask; /* tx ring modular mask */
369 unsigned int rx_mod_mask; /* rx ring modular mask */ 256 unsigned short rx_len_bits;
370 unsigned int tx_mod_mask; /* tx ring modular mask */ 257 unsigned short tx_len_bits;
371 unsigned short rx_len_bits; 258 dma_addr_t rx_ring_dma_addr;
372 unsigned short tx_len_bits; 259 dma_addr_t tx_ring_dma_addr;
373 dma_addr_t rx_ring_dma_addr; 260 unsigned int dirty_rx, /* ring entries to be freed. */
374 dma_addr_t tx_ring_dma_addr; 261 dirty_tx;
375 unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ 262
376 struct net_device_stats stats; 263 struct net_device_stats stats;
377 char tx_full; 264 char tx_full;
378 int options; 265 char phycount; /* number of phys found */
379 unsigned int shared_irq:1, /* shared irq possible */ 266 int options;
380 dxsuflo:1, /* disable transmit stop on uflo */ 267 unsigned int shared_irq:1, /* shared irq possible */
381 mii:1; /* mii port available */ 268 dxsuflo:1, /* disable transmit stop on uflo */
382 struct net_device *next; 269 mii:1; /* mii port available */
383 struct mii_if_info mii_if; 270 struct net_device *next;
384 struct timer_list watchdog_timer; 271 struct mii_if_info mii_if;
385 struct timer_list blink_timer; 272 struct timer_list watchdog_timer;
386 u32 msg_enable; /* debug message level */ 273 struct timer_list blink_timer;
274 u32 msg_enable; /* debug message level */
275
276 /* each bit indicates an available PHY */
277 u32 phymask;
387}; 278};
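
The ring descriptors above are what the 16-byte alignment comment in struct pcnet32_private refers to: in 32bit mode every Tx/Rx ring entry must sit on a 16-byte boundary, which works out because each descriptor is exactly 16 bytes with no padding. A stand-alone sketch (plain user-space C, not driver code) that checks this; the struct names are local stand-ins for pcnet32_rx_head and pcnet32_tx_head:

#include <stdio.h>
#include <stdint.h>

/* Local stand-ins for pcnet32_rx_head / pcnet32_tx_head above. */
struct rx_head {
	uint32_t base;		/* 4 + 2 + 2 + 4 + 4 bytes per descriptor */
	int16_t  buf_length;
	int16_t  status;
	uint32_t msg_length;
	uint32_t reserved;
};

struct tx_head {
	uint32_t base;
	int16_t  length;
	int16_t  status;
	uint32_t misc;
	uint32_t reserved;
};

int main(void)
{
	/* Both print 16: a ring whose base is 16-byte aligned therefore
	 * keeps every entry on a 16-byte boundary. */
	printf("rx descriptor: %zu bytes\n", sizeof(struct rx_head));
	printf("tx descriptor: %zu bytes\n", sizeof(struct tx_head));
	return 0;
}

As the comment ahead of struct pcnet32_private notes, the private block is meant to be allocated with pci_alloc_consistent() so the chip and the CPU share a coherent view of the init block and descriptors.
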
388 279
389static void pcnet32_probe_vlbus(void); 280static void pcnet32_probe_vlbus(void);
390static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); 281static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *);
391static int pcnet32_probe1(unsigned long, int, struct pci_dev *); 282static int pcnet32_probe1(unsigned long, int, struct pci_dev *);
392static int pcnet32_open(struct net_device *); 283static int pcnet32_open(struct net_device *);
393static int pcnet32_init_ring(struct net_device *); 284static int pcnet32_init_ring(struct net_device *);
394static int pcnet32_start_xmit(struct sk_buff *, struct net_device *); 285static int pcnet32_start_xmit(struct sk_buff *, struct net_device *);
395static int pcnet32_rx(struct net_device *); 286static int pcnet32_rx(struct net_device *);
396static void pcnet32_tx_timeout (struct net_device *dev); 287static void pcnet32_tx_timeout(struct net_device *dev);
397static irqreturn_t pcnet32_interrupt(int, void *, struct pt_regs *); 288static irqreturn_t pcnet32_interrupt(int, void *, struct pt_regs *);
398static int pcnet32_close(struct net_device *); 289static int pcnet32_close(struct net_device *);
399static struct net_device_stats *pcnet32_get_stats(struct net_device *); 290static struct net_device_stats *pcnet32_get_stats(struct net_device *);
400static void pcnet32_load_multicast(struct net_device *dev); 291static void pcnet32_load_multicast(struct net_device *dev);
401static void pcnet32_set_multicast_list(struct net_device *); 292static void pcnet32_set_multicast_list(struct net_device *);
402static int pcnet32_ioctl(struct net_device *, struct ifreq *, int); 293static int pcnet32_ioctl(struct net_device *, struct ifreq *, int);
403static void pcnet32_watchdog(struct net_device *); 294static void pcnet32_watchdog(struct net_device *);
404static int mdio_read(struct net_device *dev, int phy_id, int reg_num); 295static int mdio_read(struct net_device *dev, int phy_id, int reg_num);
405static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val); 296static void mdio_write(struct net_device *dev, int phy_id, int reg_num,
297 int val);
406static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits); 298static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits);
407static void pcnet32_ethtool_test(struct net_device *dev, 299static void pcnet32_ethtool_test(struct net_device *dev,
408 struct ethtool_test *eth_test, u64 *data); 300 struct ethtool_test *eth_test, u64 * data);
409static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1); 301static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1);
410static int pcnet32_phys_id(struct net_device *dev, u32 data); 302static int pcnet32_phys_id(struct net_device *dev, u32 data);
411static void pcnet32_led_blink_callback(struct net_device *dev); 303static void pcnet32_led_blink_callback(struct net_device *dev);
412static int pcnet32_get_regs_len(struct net_device *dev); 304static int pcnet32_get_regs_len(struct net_device *dev);
413static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, 305static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs,
414 void *ptr); 306 void *ptr);
415static void pcnet32_purge_tx_ring(struct net_device *dev); 307static void pcnet32_purge_tx_ring(struct net_device *dev);
416static int pcnet32_alloc_ring(struct net_device *dev, char *name); 308static int pcnet32_alloc_ring(struct net_device *dev, char *name);
417static void pcnet32_free_ring(struct net_device *dev); 309static void pcnet32_free_ring(struct net_device *dev);
418 310static void pcnet32_check_media(struct net_device *dev, int verbose);
419 311
420enum pci_flags_bit { 312enum pci_flags_bit {
421 PCI_USES_IO=1, PCI_USES_MEM=2, PCI_USES_MASTER=4, 313 PCI_USES_IO = 1, PCI_USES_MEM = 2, PCI_USES_MASTER = 4,
422 PCI_ADDR0=0x10<<0, PCI_ADDR1=0x10<<1, PCI_ADDR2=0x10<<2, PCI_ADDR3=0x10<<3, 314 PCI_ADDR0 = 0x10 << 0, PCI_ADDR1 = 0x10 << 1, PCI_ADDR2 =
315 0x10 << 2, PCI_ADDR3 = 0x10 << 3,
423}; 316};
424 317
425 318static u16 pcnet32_wio_read_csr(unsigned long addr, int index)
426static u16 pcnet32_wio_read_csr (unsigned long addr, int index)
427{ 319{
428 outw (index, addr+PCNET32_WIO_RAP); 320 outw(index, addr + PCNET32_WIO_RAP);
429 return inw (addr+PCNET32_WIO_RDP); 321 return inw(addr + PCNET32_WIO_RDP);
430} 322}
431 323
432static void pcnet32_wio_write_csr (unsigned long addr, int index, u16 val) 324static void pcnet32_wio_write_csr(unsigned long addr, int index, u16 val)
433{ 325{
434 outw (index, addr+PCNET32_WIO_RAP); 326 outw(index, addr + PCNET32_WIO_RAP);
435 outw (val, addr+PCNET32_WIO_RDP); 327 outw(val, addr + PCNET32_WIO_RDP);
436} 328}
437 329
438static u16 pcnet32_wio_read_bcr (unsigned long addr, int index) 330static u16 pcnet32_wio_read_bcr(unsigned long addr, int index)
439{ 331{
440 outw (index, addr+PCNET32_WIO_RAP); 332 outw(index, addr + PCNET32_WIO_RAP);
441 return inw (addr+PCNET32_WIO_BDP); 333 return inw(addr + PCNET32_WIO_BDP);
442} 334}
443 335
444static void pcnet32_wio_write_bcr (unsigned long addr, int index, u16 val) 336static void pcnet32_wio_write_bcr(unsigned long addr, int index, u16 val)
445{ 337{
446 outw (index, addr+PCNET32_WIO_RAP); 338 outw(index, addr + PCNET32_WIO_RAP);
447 outw (val, addr+PCNET32_WIO_BDP); 339 outw(val, addr + PCNET32_WIO_BDP);
448} 340}
449 341
450static u16 pcnet32_wio_read_rap (unsigned long addr) 342static u16 pcnet32_wio_read_rap(unsigned long addr)
451{ 343{
452 return inw (addr+PCNET32_WIO_RAP); 344 return inw(addr + PCNET32_WIO_RAP);
453} 345}
454 346
455static void pcnet32_wio_write_rap (unsigned long addr, u16 val) 347static void pcnet32_wio_write_rap(unsigned long addr, u16 val)
456{ 348{
457 outw (val, addr+PCNET32_WIO_RAP); 349 outw(val, addr + PCNET32_WIO_RAP);
458} 350}
459 351
460static void pcnet32_wio_reset (unsigned long addr) 352static void pcnet32_wio_reset(unsigned long addr)
461{ 353{
462 inw (addr+PCNET32_WIO_RESET); 354 inw(addr + PCNET32_WIO_RESET);
463} 355}
464 356
465static int pcnet32_wio_check (unsigned long addr) 357static int pcnet32_wio_check(unsigned long addr)
466{ 358{
467 outw (88, addr+PCNET32_WIO_RAP); 359 outw(88, addr + PCNET32_WIO_RAP);
468 return (inw (addr+PCNET32_WIO_RAP) == 88); 360 return (inw(addr + PCNET32_WIO_RAP) == 88);
469} 361}
470 362
471static struct pcnet32_access pcnet32_wio = { 363static struct pcnet32_access pcnet32_wio = {
472 .read_csr = pcnet32_wio_read_csr, 364 .read_csr = pcnet32_wio_read_csr,
473 .write_csr = pcnet32_wio_write_csr, 365 .write_csr = pcnet32_wio_write_csr,
474 .read_bcr = pcnet32_wio_read_bcr, 366 .read_bcr = pcnet32_wio_read_bcr,
475 .write_bcr = pcnet32_wio_write_bcr, 367 .write_bcr = pcnet32_wio_write_bcr,
476 .read_rap = pcnet32_wio_read_rap, 368 .read_rap = pcnet32_wio_read_rap,
477 .write_rap = pcnet32_wio_write_rap, 369 .write_rap = pcnet32_wio_write_rap,
478 .reset = pcnet32_wio_reset 370 .reset = pcnet32_wio_reset
479}; 371};
480 372
481static u16 pcnet32_dwio_read_csr (unsigned long addr, int index) 373static u16 pcnet32_dwio_read_csr(unsigned long addr, int index)
482{ 374{
483 outl (index, addr+PCNET32_DWIO_RAP); 375 outl(index, addr + PCNET32_DWIO_RAP);
484 return (inl (addr+PCNET32_DWIO_RDP) & 0xffff); 376 return (inl(addr + PCNET32_DWIO_RDP) & 0xffff);
485} 377}
486 378
487static void pcnet32_dwio_write_csr (unsigned long addr, int index, u16 val) 379static void pcnet32_dwio_write_csr(unsigned long addr, int index, u16 val)
488{ 380{
489 outl (index, addr+PCNET32_DWIO_RAP); 381 outl(index, addr + PCNET32_DWIO_RAP);
490 outl (val, addr+PCNET32_DWIO_RDP); 382 outl(val, addr + PCNET32_DWIO_RDP);
491} 383}
492 384
493static u16 pcnet32_dwio_read_bcr (unsigned long addr, int index) 385static u16 pcnet32_dwio_read_bcr(unsigned long addr, int index)
494{ 386{
495 outl (index, addr+PCNET32_DWIO_RAP); 387 outl(index, addr + PCNET32_DWIO_RAP);
496 return (inl (addr+PCNET32_DWIO_BDP) & 0xffff); 388 return (inl(addr + PCNET32_DWIO_BDP) & 0xffff);
497} 389}
498 390
499static void pcnet32_dwio_write_bcr (unsigned long addr, int index, u16 val) 391static void pcnet32_dwio_write_bcr(unsigned long addr, int index, u16 val)
500{ 392{
501 outl (index, addr+PCNET32_DWIO_RAP); 393 outl(index, addr + PCNET32_DWIO_RAP);
502 outl (val, addr+PCNET32_DWIO_BDP); 394 outl(val, addr + PCNET32_DWIO_BDP);
503} 395}
504 396
505static u16 pcnet32_dwio_read_rap (unsigned long addr) 397static u16 pcnet32_dwio_read_rap(unsigned long addr)
506{ 398{
507 return (inl (addr+PCNET32_DWIO_RAP) & 0xffff); 399 return (inl(addr + PCNET32_DWIO_RAP) & 0xffff);
508} 400}
509 401
510static void pcnet32_dwio_write_rap (unsigned long addr, u16 val) 402static void pcnet32_dwio_write_rap(unsigned long addr, u16 val)
511{ 403{
512 outl (val, addr+PCNET32_DWIO_RAP); 404 outl(val, addr + PCNET32_DWIO_RAP);
513} 405}
514 406
515static void pcnet32_dwio_reset (unsigned long addr) 407static void pcnet32_dwio_reset(unsigned long addr)
516{ 408{
517 inl (addr+PCNET32_DWIO_RESET); 409 inl(addr + PCNET32_DWIO_RESET);
518} 410}
519 411
520static int pcnet32_dwio_check (unsigned long addr) 412static int pcnet32_dwio_check(unsigned long addr)
521{ 413{
522 outl (88, addr+PCNET32_DWIO_RAP); 414 outl(88, addr + PCNET32_DWIO_RAP);
523 return ((inl (addr+PCNET32_DWIO_RAP) & 0xffff) == 88); 415 return ((inl(addr + PCNET32_DWIO_RAP) & 0xffff) == 88);
524} 416}
525 417
526static struct pcnet32_access pcnet32_dwio = { 418static struct pcnet32_access pcnet32_dwio = {
527 .read_csr = pcnet32_dwio_read_csr, 419 .read_csr = pcnet32_dwio_read_csr,
528 .write_csr = pcnet32_dwio_write_csr, 420 .write_csr = pcnet32_dwio_write_csr,
529 .read_bcr = pcnet32_dwio_read_bcr, 421 .read_bcr = pcnet32_dwio_read_bcr,
530 .write_bcr = pcnet32_dwio_write_bcr, 422 .write_bcr = pcnet32_dwio_write_bcr,
531 .read_rap = pcnet32_dwio_read_rap, 423 .read_rap = pcnet32_dwio_read_rap,
532 .write_rap = pcnet32_dwio_write_rap, 424 .write_rap = pcnet32_dwio_write_rap,
533 .reset = pcnet32_dwio_reset 425 .reset = pcnet32_dwio_reset
534}; 426};
535 427
536#ifdef CONFIG_NET_POLL_CONTROLLER 428#ifdef CONFIG_NET_POLL_CONTROLLER
537static void pcnet32_poll_controller(struct net_device *dev) 429static void pcnet32_poll_controller(struct net_device *dev)
538{ 430{
539 disable_irq(dev->irq); 431 disable_irq(dev->irq);
540 pcnet32_interrupt(0, dev, NULL); 432 pcnet32_interrupt(0, dev, NULL);
541 enable_irq(dev->irq); 433 enable_irq(dev->irq);
542} 434}
543#endif 435#endif
544 436
545
546static int pcnet32_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) 437static int pcnet32_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
547{ 438{
548 struct pcnet32_private *lp = dev->priv; 439 struct pcnet32_private *lp = dev->priv;
549 unsigned long flags; 440 unsigned long flags;
550 int r = -EOPNOTSUPP; 441 int r = -EOPNOTSUPP;
551 442
552 if (lp->mii) { 443 if (lp->mii) {
553 spin_lock_irqsave(&lp->lock, flags); 444 spin_lock_irqsave(&lp->lock, flags);
554 mii_ethtool_gset(&lp->mii_if, cmd); 445 mii_ethtool_gset(&lp->mii_if, cmd);
555 spin_unlock_irqrestore(&lp->lock, flags); 446 spin_unlock_irqrestore(&lp->lock, flags);
556 r = 0; 447 r = 0;
557 } 448 }
558 return r; 449 return r;
559} 450}
560 451
561static int pcnet32_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) 452static int pcnet32_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
562{ 453{
563 struct pcnet32_private *lp = dev->priv; 454 struct pcnet32_private *lp = dev->priv;
564 unsigned long flags; 455 unsigned long flags;
565 int r = -EOPNOTSUPP; 456 int r = -EOPNOTSUPP;
566 457
567 if (lp->mii) { 458 if (lp->mii) {
568 spin_lock_irqsave(&lp->lock, flags); 459 spin_lock_irqsave(&lp->lock, flags);
569 r = mii_ethtool_sset(&lp->mii_if, cmd); 460 r = mii_ethtool_sset(&lp->mii_if, cmd);
570 spin_unlock_irqrestore(&lp->lock, flags); 461 spin_unlock_irqrestore(&lp->lock, flags);
571 } 462 }
572 return r; 463 return r;
573} 464}
574 465
575static void pcnet32_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 466static void pcnet32_get_drvinfo(struct net_device *dev,
467 struct ethtool_drvinfo *info)
576{ 468{
577 struct pcnet32_private *lp = dev->priv; 469 struct pcnet32_private *lp = dev->priv;
578 470
579 strcpy (info->driver, DRV_NAME); 471 strcpy(info->driver, DRV_NAME);
580 strcpy (info->version, DRV_VERSION); 472 strcpy(info->version, DRV_VERSION);
581 if (lp->pci_dev) 473 if (lp->pci_dev)
582 strcpy (info->bus_info, pci_name(lp->pci_dev)); 474 strcpy(info->bus_info, pci_name(lp->pci_dev));
583 else 475 else
584 sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr); 476 sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr);
585} 477}
586 478
587static u32 pcnet32_get_link(struct net_device *dev) 479static u32 pcnet32_get_link(struct net_device *dev)
588{ 480{
589 struct pcnet32_private *lp = dev->priv; 481 struct pcnet32_private *lp = dev->priv;
590 unsigned long flags; 482 unsigned long flags;
591 int r; 483 int r;
592
593 spin_lock_irqsave(&lp->lock, flags);
594 if (lp->mii) {
595 r = mii_link_ok(&lp->mii_if);
596 } else {
597 ulong ioaddr = dev->base_addr; /* card base I/O address */
598 r = (lp->a.read_bcr(ioaddr, 4) != 0xc0);
599 }
600 spin_unlock_irqrestore(&lp->lock, flags);
601 484
602 return r; 485 spin_lock_irqsave(&lp->lock, flags);
486 if (lp->mii) {
487 r = mii_link_ok(&lp->mii_if);
488 } else {
489 ulong ioaddr = dev->base_addr; /* card base I/O address */
490 r = (lp->a.read_bcr(ioaddr, 4) != 0xc0);
491 }
492 spin_unlock_irqrestore(&lp->lock, flags);
493
494 return r;
603} 495}
604 496
605static u32 pcnet32_get_msglevel(struct net_device *dev) 497static u32 pcnet32_get_msglevel(struct net_device *dev)
606{ 498{
607 struct pcnet32_private *lp = dev->priv; 499 struct pcnet32_private *lp = dev->priv;
608 return lp->msg_enable; 500 return lp->msg_enable;
609} 501}
610 502
611static void pcnet32_set_msglevel(struct net_device *dev, u32 value) 503static void pcnet32_set_msglevel(struct net_device *dev, u32 value)
612{ 504{
613 struct pcnet32_private *lp = dev->priv; 505 struct pcnet32_private *lp = dev->priv;
614 lp->msg_enable = value; 506 lp->msg_enable = value;
615} 507}
616 508
617static int pcnet32_nway_reset(struct net_device *dev) 509static int pcnet32_nway_reset(struct net_device *dev)
618{ 510{
619 struct pcnet32_private *lp = dev->priv; 511 struct pcnet32_private *lp = dev->priv;
620 unsigned long flags; 512 unsigned long flags;
621 int r = -EOPNOTSUPP; 513 int r = -EOPNOTSUPP;
622 514
623 if (lp->mii) { 515 if (lp->mii) {
624 spin_lock_irqsave(&lp->lock, flags); 516 spin_lock_irqsave(&lp->lock, flags);
625 r = mii_nway_restart(&lp->mii_if); 517 r = mii_nway_restart(&lp->mii_if);
626 spin_unlock_irqrestore(&lp->lock, flags); 518 spin_unlock_irqrestore(&lp->lock, flags);
627 } 519 }
628 return r; 520 return r;
629} 521}
630 522
631static void pcnet32_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) 523static void pcnet32_get_ringparam(struct net_device *dev,
524 struct ethtool_ringparam *ering)
632{ 525{
633 struct pcnet32_private *lp = dev->priv; 526 struct pcnet32_private *lp = dev->priv;
634 527
635 ering->tx_max_pending = TX_MAX_RING_SIZE - 1; 528 ering->tx_max_pending = TX_MAX_RING_SIZE - 1;
636 ering->tx_pending = lp->tx_ring_size - 1; 529 ering->tx_pending = lp->tx_ring_size - 1;
637 ering->rx_max_pending = RX_MAX_RING_SIZE - 1; 530 ering->rx_max_pending = RX_MAX_RING_SIZE - 1;
638 ering->rx_pending = lp->rx_ring_size - 1; 531 ering->rx_pending = lp->rx_ring_size - 1;
639} 532}
640 533
641static int pcnet32_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) 534static int pcnet32_set_ringparam(struct net_device *dev,
535 struct ethtool_ringparam *ering)
642{ 536{
643 struct pcnet32_private *lp = dev->priv; 537 struct pcnet32_private *lp = dev->priv;
644 unsigned long flags; 538 unsigned long flags;
645 int i; 539 int i;
646 540
647 if (ering->rx_mini_pending || ering->rx_jumbo_pending) 541 if (ering->rx_mini_pending || ering->rx_jumbo_pending)
648 return -EINVAL; 542 return -EINVAL;
649 543
650 if (netif_running(dev)) 544 if (netif_running(dev))
651 pcnet32_close(dev); 545 pcnet32_close(dev);
652 546
653 spin_lock_irqsave(&lp->lock, flags); 547 spin_lock_irqsave(&lp->lock, flags);
654 pcnet32_free_ring(dev);
655 lp->tx_ring_size = min(ering->tx_pending, (unsigned int) TX_MAX_RING_SIZE);
656 lp->rx_ring_size = min(ering->rx_pending, (unsigned int) RX_MAX_RING_SIZE);
657
658 /* set the minimum ring size to 4, to allow the loopback test to work
659 * unchanged.
660 */
661 for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) {
662 if (lp->tx_ring_size <= (1 << i))
663 break;
664 }
665 lp->tx_ring_size = (1 << i);
666 lp->tx_mod_mask = lp->tx_ring_size - 1;
667 lp->tx_len_bits = (i << 12);
668
669 for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) {
670 if (lp->rx_ring_size <= (1 << i))
671 break;
672 }
673 lp->rx_ring_size = (1 << i);
674 lp->rx_mod_mask = lp->rx_ring_size - 1;
675 lp->rx_len_bits = (i << 4);
676
677 if (pcnet32_alloc_ring(dev, dev->name)) {
678 pcnet32_free_ring(dev); 548 pcnet32_free_ring(dev);
679 spin_unlock_irqrestore(&lp->lock, flags); 549 lp->tx_ring_size =
680 return -ENOMEM; 550 min(ering->tx_pending, (unsigned int)TX_MAX_RING_SIZE);
681 } 551 lp->rx_ring_size =
552 min(ering->rx_pending, (unsigned int)RX_MAX_RING_SIZE);
553
554 /* set the minimum ring size to 4, to allow the loopback test to work
555 * unchanged.
556 */
557 for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) {
558 if (lp->tx_ring_size <= (1 << i))
559 break;
560 }
561 lp->tx_ring_size = (1 << i);
562 lp->tx_mod_mask = lp->tx_ring_size - 1;
563 lp->tx_len_bits = (i << 12);
682 564
683 spin_unlock_irqrestore(&lp->lock, flags); 565 for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) {
566 if (lp->rx_ring_size <= (1 << i))
567 break;
568 }
569 lp->rx_ring_size = (1 << i);
570 lp->rx_mod_mask = lp->rx_ring_size - 1;
571 lp->rx_len_bits = (i << 4);
572
573 if (pcnet32_alloc_ring(dev, dev->name)) {
574 pcnet32_free_ring(dev);
575 spin_unlock_irqrestore(&lp->lock, flags);
576 return -ENOMEM;
577 }
684 578
685 if (pcnet32_debug & NETIF_MSG_DRV) 579 spin_unlock_irqrestore(&lp->lock, flags);
686 printk(KERN_INFO PFX "%s: Ring Param Settings: RX: %d, TX: %d\n",
687 dev->name, lp->rx_ring_size, lp->tx_ring_size);
688 580
689 if (netif_running(dev)) 581 if (pcnet32_debug & NETIF_MSG_DRV)
690 pcnet32_open(dev); 582 printk(KERN_INFO PFX
583 "%s: Ring Param Settings: RX: %d, TX: %d\n", dev->name,
584 lp->rx_ring_size, lp->tx_ring_size);
691 585
692 return 0; 586 if (netif_running(dev))
587 pcnet32_open(dev);
588
589 return 0;
693} 590}
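
pcnet32_set_ringparam() above clamps the requested counts to the driver maxima and then rounds each ring up to a power of two, with a floor of 4 (2^2) so the loopback self-test keeps working unchanged; the exponent it finds is also shifted into the encoded length fields (i << 12 for Tx, i << 4 for Rx). A minimal stand-alone sketch of that rounding step, with round_up_ring() and LOG_MAX_BUFFERS as illustrative names standing in for the driver's PCNET32_LOG_MAX_{TX,RX}_BUFFERS limits:

#include <stdio.h>

#define LOG_MAX_BUFFERS 7	/* stand-in for the driver's per-ring limit */

/* Round a requested ring size up to a power of two between 4 and
 * (1 << LOG_MAX_BUFFERS); returns the size and its log2 exponent.
 * The real driver pre-clamps the request with min(), so the loop
 * always terminates via the break. */
static unsigned int round_up_ring(unsigned int requested, unsigned int *log2_bits)
{
	unsigned int i;

	for (i = 2; i <= LOG_MAX_BUFFERS; i++)
		if (requested <= (1u << i))
			break;
	if (i > LOG_MAX_BUFFERS)
		i = LOG_MAX_BUFFERS;
	*log2_bits = i;
	return 1u << i;
}

int main(void)
{
	unsigned int bits;
	unsigned int size = round_up_ring(100, &bits);

	printf("requested 100 -> size %u, log2 %u, mod mask 0x%x\n",
	       size, bits, size - 1);	/* size 128, log2 7, mask 0x7f */
	return 0;
}

The driver keeps size - 1 as the ring's modular mask so the Rx/Tx index wrap-around can be done with a simple AND.
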
694 591
695static void pcnet32_get_strings(struct net_device *dev, u32 stringset, u8 *data) 592static void pcnet32_get_strings(struct net_device *dev, u32 stringset,
593 u8 * data)
696{ 594{
697 memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test)); 595 memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test));
698} 596}
699 597
700static int pcnet32_self_test_count(struct net_device *dev) 598static int pcnet32_self_test_count(struct net_device *dev)
701{ 599{
702 return PCNET32_TEST_LEN; 600 return PCNET32_TEST_LEN;
703} 601}
704 602
705static void pcnet32_ethtool_test(struct net_device *dev, 603static void pcnet32_ethtool_test(struct net_device *dev,
706 struct ethtool_test *test, u64 *data) 604 struct ethtool_test *test, u64 * data)
707{ 605{
708 struct pcnet32_private *lp = dev->priv; 606 struct pcnet32_private *lp = dev->priv;
709 int rc; 607 int rc;
710 608
711 if (test->flags == ETH_TEST_FL_OFFLINE) { 609 if (test->flags == ETH_TEST_FL_OFFLINE) {
712 rc = pcnet32_loopback_test(dev, data); 610 rc = pcnet32_loopback_test(dev, data);
713 if (rc) { 611 if (rc) {
714 if (netif_msg_hw(lp)) 612 if (netif_msg_hw(lp))
715 printk(KERN_DEBUG "%s: Loopback test failed.\n", dev->name); 613 printk(KERN_DEBUG "%s: Loopback test failed.\n",
716 test->flags |= ETH_TEST_FL_FAILED; 614 dev->name);
615 test->flags |= ETH_TEST_FL_FAILED;
616 } else if (netif_msg_hw(lp))
617 printk(KERN_DEBUG "%s: Loopback test passed.\n",
618 dev->name);
717 } else if (netif_msg_hw(lp)) 619 } else if (netif_msg_hw(lp))
718 printk(KERN_DEBUG "%s: Loopback test passed.\n", dev->name); 620 printk(KERN_DEBUG
719 } else if (netif_msg_hw(lp)) 621 "%s: No tests to run (specify 'Offline' on ethtool).",
720 printk(KERN_DEBUG "%s: No tests to run (specify 'Offline' on ethtool).", dev->name); 622 dev->name);
721} /* end pcnet32_ethtool_test */ 623} /* end pcnet32_ethtool_test */
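
pcnet32_ethtool_test() above only runs the loopback test when userspace requests an offline test, which is what `ethtool -t eth0 offline` ends up sending. A rough user-space sketch of the same request through the SIOCETHTOOL ioctl; the interface name is made up, and the single-entry result buffer assumes PCNET32_TEST_LEN is 1:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct {
		struct ethtool_test hdr;
		__u64 result[1];	/* assumes PCNET32_TEST_LEN == 1 */
	} req;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.hdr.cmd = ETHTOOL_TEST;
	req.hdr.flags = ETH_TEST_FL_OFFLINE;	/* loopback runs offline only */
	req.hdr.len = 1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* made-up interface */
	ifr.ifr_data = (char *)&req;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_TEST");
	else
		printf("loopback: %s\n", req.result[0] ? "FAILED" : "passed");

	close(fd);
	return 0;
}

When the comparison fails the driver also sets ETH_TEST_FL_FAILED in the returned flags, matching the nonzero *data1 value it reports.
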
722 624
723static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1) 625static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1)
724{ 626{
725 struct pcnet32_private *lp = dev->priv; 627 struct pcnet32_private *lp = dev->priv;
726 struct pcnet32_access *a = &lp->a; /* access to registers */ 628 struct pcnet32_access *a = &lp->a; /* access to registers */
727 ulong ioaddr = dev->base_addr; /* card base I/O address */ 629 ulong ioaddr = dev->base_addr; /* card base I/O address */
728 struct sk_buff *skb; /* sk buff */ 630 struct sk_buff *skb; /* sk buff */
729 int x, i; /* counters */ 631 int x, i; /* counters */
730 int numbuffs = 4; /* number of TX/RX buffers and descs */ 632 int numbuffs = 4; /* number of TX/RX buffers and descs */
731 u16 status = 0x8300; /* TX ring status */ 633 u16 status = 0x8300; /* TX ring status */
732 u16 teststatus; /* test of ring status */ 634 u16 teststatus; /* test of ring status */
733 int rc; /* return code */ 635 int rc; /* return code */
734 int size; /* size of packets */ 636 int size; /* size of packets */
735 unsigned char *packet; /* source packet data */ 637 unsigned char *packet; /* source packet data */
736 static const int data_len = 60; /* length of source packets */ 638 static const int data_len = 60; /* length of source packets */
737 unsigned long flags; 639 unsigned long flags;
738 unsigned long ticks; 640 unsigned long ticks;
739 641
740 *data1 = 1; /* status of test, default to fail */ 642 *data1 = 1; /* status of test, default to fail */
741 rc = 1; /* default to fail */ 643 rc = 1; /* default to fail */
742 644
743 if (netif_running(dev)) 645 if (netif_running(dev))
744 pcnet32_close(dev); 646 pcnet32_close(dev);
745 647
746 spin_lock_irqsave(&lp->lock, flags); 648 spin_lock_irqsave(&lp->lock, flags);
747 649
748 /* Reset the PCNET32 */ 650 /* Reset the PCNET32 */
749 lp->a.reset (ioaddr); 651 lp->a.reset(ioaddr);
750 652
751 /* switch pcnet32 to 32bit mode */ 653 /* switch pcnet32 to 32bit mode */
752 lp->a.write_bcr (ioaddr, 20, 2); 654 lp->a.write_bcr(ioaddr, 20, 2);
753 655
754 lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); 656 lp->init_block.mode =
755 lp->init_block.filter[0] = 0; 657 le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7);
756 lp->init_block.filter[1] = 0; 658 lp->init_block.filter[0] = 0;
757 659 lp->init_block.filter[1] = 0;
758 /* purge & init rings but don't actually restart */ 660
759 pcnet32_restart(dev, 0x0000); 661 /* purge & init rings but don't actually restart */
760 662 pcnet32_restart(dev, 0x0000);
761 lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ 663
762 664 lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */
763 /* Initialize Transmit buffers. */ 665
764 size = data_len + 15; 666 /* Initialize Transmit buffers. */
765 for (x=0; x<numbuffs; x++) { 667 size = data_len + 15;
766 if (!(skb = dev_alloc_skb(size))) { 668 for (x = 0; x < numbuffs; x++) {
767 if (netif_msg_hw(lp)) 669 if (!(skb = dev_alloc_skb(size))) {
768 printk(KERN_DEBUG "%s: Cannot allocate skb at line: %d!\n", 670 if (netif_msg_hw(lp))
769 dev->name, __LINE__); 671 printk(KERN_DEBUG
770 goto clean_up; 672 "%s: Cannot allocate skb at line: %d!\n",
771 } else { 673 dev->name, __LINE__);
772 packet = skb->data; 674 goto clean_up;
773 skb_put(skb, size); /* create space for data */ 675 } else {
774 lp->tx_skbuff[x] = skb; 676 packet = skb->data;
775 lp->tx_ring[x].length = le16_to_cpu(-skb->len); 677 skb_put(skb, size); /* create space for data */
776 lp->tx_ring[x].misc = 0; 678 lp->tx_skbuff[x] = skb;
777 679 lp->tx_ring[x].length = le16_to_cpu(-skb->len);
778 /* put DA and SA into the skb */ 680 lp->tx_ring[x].misc = 0;
779 for (i=0; i<6; i++) 681
780 *packet++ = dev->dev_addr[i]; 682 /* put DA and SA into the skb */
781 for (i=0; i<6; i++) 683 for (i = 0; i < 6; i++)
782 *packet++ = dev->dev_addr[i]; 684 *packet++ = dev->dev_addr[i];
783 /* type */ 685 for (i = 0; i < 6; i++)
784 *packet++ = 0x08; 686 *packet++ = dev->dev_addr[i];
785 *packet++ = 0x06; 687 /* type */
786 /* packet number */ 688 *packet++ = 0x08;
787 *packet++ = x; 689 *packet++ = 0x06;
788 /* fill packet with data */ 690 /* packet number */
789 for (i=0; i<data_len; i++) 691 *packet++ = x;
790 *packet++ = i; 692 /* fill packet with data */
791 693 for (i = 0; i < data_len; i++)
792 lp->tx_dma_addr[x] = pci_map_single(lp->pci_dev, skb->data, 694 *packet++ = i;
793 skb->len, PCI_DMA_TODEVICE); 695
794 lp->tx_ring[x].base = (u32)le32_to_cpu(lp->tx_dma_addr[x]); 696 lp->tx_dma_addr[x] =
795 wmb(); /* Make sure owner changes after all others are visible */ 697 pci_map_single(lp->pci_dev, skb->data, skb->len,
796 lp->tx_ring[x].status = le16_to_cpu(status); 698 PCI_DMA_TODEVICE);
797 } 699 lp->tx_ring[x].base =
798 } 700 (u32) le32_to_cpu(lp->tx_dma_addr[x]);
799 701 wmb(); /* Make sure owner changes after all others are visible */
800 x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */ 702 lp->tx_ring[x].status = le16_to_cpu(status);
801 x = x | 0x0002; 703 }
802 a->write_bcr(ioaddr, 32, x); 704 }
803 705
804 lp->a.write_csr (ioaddr, 15, 0x0044); /* set int loopback in CSR15 */ 706 x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */
805 707 x = x | 0x0002;
806 teststatus = le16_to_cpu(0x8000); 708 a->write_bcr(ioaddr, 32, x);
807 lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */ 709
808 710 lp->a.write_csr(ioaddr, 15, 0x0044); /* set int loopback in CSR15 */
809 /* Check status of descriptors */ 711
810 for (x=0; x<numbuffs; x++) { 712 teststatus = le16_to_cpu(0x8000);
811 ticks = 0; 713 lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */
812 rmb(); 714
813 while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) { 715 /* Check status of descriptors */
814 spin_unlock_irqrestore(&lp->lock, flags); 716 for (x = 0; x < numbuffs; x++) {
815 mdelay(1); 717 ticks = 0;
816 spin_lock_irqsave(&lp->lock, flags); 718 rmb();
817 rmb(); 719 while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) {
818 ticks++; 720 spin_unlock_irqrestore(&lp->lock, flags);
819 } 721 mdelay(1);
820 if (ticks == 200) { 722 spin_lock_irqsave(&lp->lock, flags);
821 if (netif_msg_hw(lp)) 723 rmb();
822 printk("%s: Desc %d failed to reset!\n",dev->name,x); 724 ticks++;
823 break; 725 }
824 } 726 if (ticks == 200) {
825 } 727 if (netif_msg_hw(lp))
826 728 printk("%s: Desc %d failed to reset!\n",
827 lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ 729 dev->name, x);
828 wmb(); 730 break;
829 if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) { 731 }
830 printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name); 732 }
831 733
832 for (x=0; x<numbuffs; x++) { 734 lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */
833 printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x); 735 wmb();
834 skb = lp->rx_skbuff[x]; 736 if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) {
835 for (i=0; i<size; i++) { 737 printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name);
836 printk("%02x ", *(skb->data+i)); 738
837 } 739 for (x = 0; x < numbuffs; x++) {
838 printk("\n"); 740 printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x);
839 } 741 skb = lp->rx_skbuff[x];
840 } 742 for (i = 0; i < size; i++) {
841 743 printk("%02x ", *(skb->data + i));
842 x = 0; 744 }
843 rc = 0; 745 printk("\n");
844 while (x<numbuffs && !rc) { 746 }
845 skb = lp->rx_skbuff[x]; 747 }
846 packet = lp->tx_skbuff[x]->data; 748
847 for (i=0; i<size; i++) { 749 x = 0;
848 if (*(skb->data+i) != packet[i]) { 750 rc = 0;
849 if (netif_msg_hw(lp)) 751 while (x < numbuffs && !rc) {
850 printk(KERN_DEBUG "%s: Error in compare! %2x - %02x %02x\n", 752 skb = lp->rx_skbuff[x];
851 dev->name, i, *(skb->data+i), packet[i]); 753 packet = lp->tx_skbuff[x]->data;
852 rc = 1; 754 for (i = 0; i < size; i++) {
853 break; 755 if (*(skb->data + i) != packet[i]) {
854 } 756 if (netif_msg_hw(lp))
757 printk(KERN_DEBUG
758 "%s: Error in compare! %2x - %02x %02x\n",
759 dev->name, i, *(skb->data + i),
760 packet[i]);
761 rc = 1;
762 break;
763 }
764 }
765 x++;
766 }
767 if (!rc) {
768 *data1 = 0;
855 } 769 }
856 x++;
857 }
858 if (!rc) {
859 *data1 = 0;
860 }
861 770
862clean_up: 771 clean_up:
863 pcnet32_purge_tx_ring(dev); 772 pcnet32_purge_tx_ring(dev);
864 x = a->read_csr(ioaddr, 15) & 0xFFFF; 773 x = a->read_csr(ioaddr, 15) & 0xFFFF;
865 a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */ 774 a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */
866 775
867 x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ 776 x = a->read_bcr(ioaddr, 32); /* reset internal loopback */
868 x = x & ~0x0002; 777 x = x & ~0x0002;
869 a->write_bcr(ioaddr, 32, x); 778 a->write_bcr(ioaddr, 32, x);
870 779
871 spin_unlock_irqrestore(&lp->lock, flags); 780 spin_unlock_irqrestore(&lp->lock, flags);
872 781
873 if (netif_running(dev)) { 782 if (netif_running(dev)) {
874 pcnet32_open(dev); 783 pcnet32_open(dev);
875 } else { 784 } else {
876 lp->a.write_bcr (ioaddr, 20, 4); /* return to 16bit mode */ 785 lp->a.write_bcr(ioaddr, 20, 4); /* return to 16bit mode */
877 } 786 }
878 787
879 return(rc); 788 return (rc);
880} /* end pcnet32_loopback_test */ 789} /* end pcnet32_loopback_test */
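
For reference, the loopback test above builds numbuffs small frames whose destination and source addresses are both the device's own MAC, followed by the two type bytes 0x08 0x06, the buffer index, and an incrementing fill pattern; after running the chip in internal loopback it byte-compares each received buffer against the transmitted one. A stand-alone sketch of that frame layout (build_test_frame() and the MAC value are made up for illustration; 60 matches the data_len used above):

#include <stdio.h>
#include <string.h>

/* Lay out one test frame the way pcnet32_loopback_test() does:
 * DA == SA == device MAC, 0x08 0x06 type bytes, packet number,
 * then data_len bytes of incrementing data. */
static size_t build_test_frame(unsigned char *buf, const unsigned char mac[6],
			       unsigned char index, size_t data_len)
{
	unsigned char *p = buf;
	size_t i;

	memcpy(p, mac, 6);	/* destination address */
	p += 6;
	memcpy(p, mac, 6);	/* source address */
	p += 6;
	*p++ = 0x08;		/* type bytes used by the test */
	*p++ = 0x06;
	*p++ = index;		/* packet number */
	for (i = 0; i < data_len; i++)
		*p++ = (unsigned char)i;
	return (size_t)(p - buf);	/* 75 bytes == data_len + 15 */
}

int main(void)
{
	unsigned char mac[6] = { 0x00, 0x10, 0x18, 0x00, 0x00, 0x01 };	/* made up */
	unsigned char frame[128];

	printf("test frame length: %zu bytes\n",
	       build_test_frame(frame, mac, 0, 60));
	return 0;
}

The skb allocated for each test buffer above is sized data_len + 15, which is exactly the length this layout produces.
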
881 790
882static void pcnet32_led_blink_callback(struct net_device *dev) 791static void pcnet32_led_blink_callback(struct net_device *dev)
883{ 792{
884 struct pcnet32_private *lp = dev->priv; 793 struct pcnet32_private *lp = dev->priv;
885 struct pcnet32_access *a = &lp->a; 794 struct pcnet32_access *a = &lp->a;
886 ulong ioaddr = dev->base_addr; 795 ulong ioaddr = dev->base_addr;
887 unsigned long flags; 796 unsigned long flags;
888 int i; 797 int i;
889 798
890 spin_lock_irqsave(&lp->lock, flags); 799 spin_lock_irqsave(&lp->lock, flags);
891 for (i=4; i<8; i++) { 800 for (i = 4; i < 8; i++) {
892 a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000); 801 a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000);
893 } 802 }
894 spin_unlock_irqrestore(&lp->lock, flags); 803 spin_unlock_irqrestore(&lp->lock, flags);
895 804
896 mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT); 805 mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT);
897} 806}
898 807
899static int pcnet32_phys_id(struct net_device *dev, u32 data) 808static int pcnet32_phys_id(struct net_device *dev, u32 data)
900{ 809{
901 struct pcnet32_private *lp = dev->priv; 810 struct pcnet32_private *lp = dev->priv;
902 struct pcnet32_access *a = &lp->a; 811 struct pcnet32_access *a = &lp->a;
903 ulong ioaddr = dev->base_addr; 812 ulong ioaddr = dev->base_addr;
904 unsigned long flags; 813 unsigned long flags;
905 int i, regs[4]; 814 int i, regs[4];
906 815
907 if (!lp->blink_timer.function) { 816 if (!lp->blink_timer.function) {
908 init_timer(&lp->blink_timer); 817 init_timer(&lp->blink_timer);
909 lp->blink_timer.function = (void *) pcnet32_led_blink_callback; 818 lp->blink_timer.function = (void *)pcnet32_led_blink_callback;
910 lp->blink_timer.data = (unsigned long) dev; 819 lp->blink_timer.data = (unsigned long)dev;
911 } 820 }
912 821
913 /* Save the current value of the bcrs */ 822 /* Save the current value of the bcrs */
914 spin_lock_irqsave(&lp->lock, flags); 823 spin_lock_irqsave(&lp->lock, flags);
915 for (i=4; i<8; i++) { 824 for (i = 4; i < 8; i++) {
916 regs[i-4] = a->read_bcr(ioaddr, i); 825 regs[i - 4] = a->read_bcr(ioaddr, i);
917 } 826 }
918 spin_unlock_irqrestore(&lp->lock, flags); 827 spin_unlock_irqrestore(&lp->lock, flags);
919 828
920 mod_timer(&lp->blink_timer, jiffies); 829 mod_timer(&lp->blink_timer, jiffies);
921 set_current_state(TASK_INTERRUPTIBLE); 830 set_current_state(TASK_INTERRUPTIBLE);
922 831
923 if ((!data) || (data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))) 832 if ((!data) || (data > (u32) (MAX_SCHEDULE_TIMEOUT / HZ)))
924 data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ); 833 data = (u32) (MAX_SCHEDULE_TIMEOUT / HZ);
925 834
926 msleep_interruptible(data * 1000); 835 msleep_interruptible(data * 1000);
927 del_timer_sync(&lp->blink_timer); 836 del_timer_sync(&lp->blink_timer);
928 837
929 /* Restore the original value of the bcrs */ 838 /* Restore the original value of the bcrs */
930 spin_lock_irqsave(&lp->lock, flags); 839 spin_lock_irqsave(&lp->lock, flags);
931 for (i=4; i<8; i++) { 840 for (i = 4; i < 8; i++) {
932 a->write_bcr(ioaddr, i, regs[i-4]); 841 a->write_bcr(ioaddr, i, regs[i - 4]);
933 } 842 }
934 spin_unlock_irqrestore(&lp->lock, flags); 843 spin_unlock_irqrestore(&lp->lock, flags);
935 844
936 return 0; 845 return 0;
937} 846}
938 847
848#define PCNET32_REGS_PER_PHY 32
849#define PCNET32_MAX_PHYS 32
939static int pcnet32_get_regs_len(struct net_device *dev) 850static int pcnet32_get_regs_len(struct net_device *dev)
940{ 851{
941 return(PCNET32_NUM_REGS * sizeof(u16)); 852 struct pcnet32_private *lp = dev->priv;
853 int j = lp->phycount * PCNET32_REGS_PER_PHY;
854
855 return ((PCNET32_NUM_REGS + j) * sizeof(u16));
942} 856}
943 857
944static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, 858static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs,
945 void *ptr) 859 void *ptr)
946{ 860{
947 int i, csr0; 861 int i, csr0;
948 u16 *buff = ptr; 862 u16 *buff = ptr;
949 struct pcnet32_private *lp = dev->priv; 863 struct pcnet32_private *lp = dev->priv;
950 struct pcnet32_access *a = &lp->a; 864 struct pcnet32_access *a = &lp->a;
951 ulong ioaddr = dev->base_addr; 865 ulong ioaddr = dev->base_addr;
952 int ticks; 866 int ticks;
953 unsigned long flags; 867 unsigned long flags;
954
955 spin_lock_irqsave(&lp->lock, flags);
956
957 csr0 = a->read_csr(ioaddr, 0);
958 if (!(csr0 & 0x0004)) { /* If not stopped */
959 /* set SUSPEND (SPND) - CSR5 bit 0 */
960 a->write_csr(ioaddr, 5, 0x0001);
961
962 /* poll waiting for bit to be set */
963 ticks = 0;
964 while (!(a->read_csr(ioaddr, 5) & 0x0001)) {
965 spin_unlock_irqrestore(&lp->lock, flags);
966 mdelay(1);
967 spin_lock_irqsave(&lp->lock, flags);
968 ticks++;
969 if (ticks > 200) {
970 if (netif_msg_hw(lp))
971 printk(KERN_DEBUG "%s: Error getting into suspend!\n",
972 dev->name);
973 break;
974 }
975 }
976 }
977 868
978 /* read address PROM */ 869 spin_lock_irqsave(&lp->lock, flags);
979 for (i=0; i<16; i += 2)
980 *buff++ = inw(ioaddr + i);
981 870
982 /* read control and status registers */ 871 csr0 = a->read_csr(ioaddr, 0);
983 for (i=0; i<90; i++) { 872 if (!(csr0 & 0x0004)) { /* If not stopped */
984 *buff++ = a->read_csr(ioaddr, i); 873 /* set SUSPEND (SPND) - CSR5 bit 0 */
985 } 874 a->write_csr(ioaddr, 5, 0x0001);
875
876 /* poll waiting for bit to be set */
877 ticks = 0;
878 while (!(a->read_csr(ioaddr, 5) & 0x0001)) {
879 spin_unlock_irqrestore(&lp->lock, flags);
880 mdelay(1);
881 spin_lock_irqsave(&lp->lock, flags);
882 ticks++;
883 if (ticks > 200) {
884 if (netif_msg_hw(lp))
885 printk(KERN_DEBUG
886 "%s: Error getting into suspend!\n",
887 dev->name);
888 break;
889 }
890 }
891 }
986 892
987 *buff++ = a->read_csr(ioaddr, 112); 893 /* read address PROM */
988 *buff++ = a->read_csr(ioaddr, 114); 894 for (i = 0; i < 16; i += 2)
895 *buff++ = inw(ioaddr + i);
989 896
990 /* read bus configuration registers */ 897 /* read control and status registers */
991 for (i=0; i<30; i++) { 898 for (i = 0; i < 90; i++) {
992 *buff++ = a->read_bcr(ioaddr, i); 899 *buff++ = a->read_csr(ioaddr, i);
993 } 900 }
994 *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */ 901
995 for (i=31; i<36; i++) { 902 *buff++ = a->read_csr(ioaddr, 112);
996 *buff++ = a->read_bcr(ioaddr, i); 903 *buff++ = a->read_csr(ioaddr, 114);
997 }
998 904
999 /* read mii phy registers */ 905 /* read bus configuration registers */
1000 if (lp->mii) { 906 for (i = 0; i < 30; i++) {
1001 for (i=0; i<32; i++) { 907 *buff++ = a->read_bcr(ioaddr, i);
1002 lp->a.write_bcr(ioaddr, 33, ((lp->mii_if.phy_id) << 5) | i); 908 }
1003 *buff++ = lp->a.read_bcr(ioaddr, 34); 909 *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */
910 for (i = 31; i < 36; i++) {
911 *buff++ = a->read_bcr(ioaddr, i);
1004 } 912 }
1005 }
1006 913
1007 if (!(csr0 & 0x0004)) { /* If not stopped */ 914 /* read mii phy registers */
1008 /* clear SUSPEND (SPND) - CSR5 bit 0 */ 915 if (lp->mii) {
1009 a->write_csr(ioaddr, 5, 0x0000); 916 int j;
1010 } 917 for (j = 0; j < PCNET32_MAX_PHYS; j++) {
918 if (lp->phymask & (1 << j)) {
919 for (i = 0; i < PCNET32_REGS_PER_PHY; i++) {
920 lp->a.write_bcr(ioaddr, 33,
921 (j << 5) | i);
922 *buff++ = lp->a.read_bcr(ioaddr, 34);
923 }
924 }
925 }
926 }
1011 927
1012 i = buff - (u16 *)ptr; 928 if (!(csr0 & 0x0004)) { /* If not stopped */
1013 for (; i < PCNET32_NUM_REGS; i++) 929 /* clear SUSPEND (SPND) - CSR5 bit 0 */
1014 *buff++ = 0; 930 a->write_csr(ioaddr, 5, 0x0000);
931 }
1015 932
1016 spin_unlock_irqrestore(&lp->lock, flags); 933 spin_unlock_irqrestore(&lp->lock, flags);
1017} 934}
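
The register dump written above has a fixed layout followed by one 32-register block per PHY flagged in lp->phymask, which is what the reworked pcnet32_get_regs_len() accounts for. A stand-alone sketch of the length arithmetic; the 136-word fixed part is added up from the loops above and is assumed to correspond to the driver's PCNET32_NUM_REGS:

#include <stdio.h>

#define REGS_PER_PHY 32	/* mirrors PCNET32_REGS_PER_PHY above */

/* Fixed part of the dump, as written by pcnet32_get_regs() above:
 * 8 words of address PROM, CSR0..CSR89, CSR112 and CSR114,
 * BCR0..BCR29, one zero placeholder for BCR30, then BCR31..BCR35. */
static size_t regs_dump_bytes(unsigned int phycount)
{
	const size_t fixed_words = 8 + 90 + 2 + 30 + 1 + 5;	/* 136 */

	return (fixed_words + (size_t)REGS_PER_PHY * phycount)
		* sizeof(unsigned short);
}

int main(void)
{
	printf("dump with 1 PHY:  %zu bytes\n", regs_dump_bytes(1));
	printf("dump with 3 PHYs: %zu bytes\n", regs_dump_bytes(3));
	return 0;
}

The zero placeholder is written instead of reading BCR30 because, as the code comment notes, touching BCR30 can hang a 79C976.
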
1018 935
1019static struct ethtool_ops pcnet32_ethtool_ops = { 936static struct ethtool_ops pcnet32_ethtool_ops = {
1020 .get_settings = pcnet32_get_settings, 937 .get_settings = pcnet32_get_settings,
1021 .set_settings = pcnet32_set_settings, 938 .set_settings = pcnet32_set_settings,
1022 .get_drvinfo = pcnet32_get_drvinfo, 939 .get_drvinfo = pcnet32_get_drvinfo,
1023 .get_msglevel = pcnet32_get_msglevel, 940 .get_msglevel = pcnet32_get_msglevel,
1024 .set_msglevel = pcnet32_set_msglevel, 941 .set_msglevel = pcnet32_set_msglevel,
1025 .nway_reset = pcnet32_nway_reset, 942 .nway_reset = pcnet32_nway_reset,
1026 .get_link = pcnet32_get_link, 943 .get_link = pcnet32_get_link,
1027 .get_ringparam = pcnet32_get_ringparam, 944 .get_ringparam = pcnet32_get_ringparam,
1028 .set_ringparam = pcnet32_set_ringparam, 945 .set_ringparam = pcnet32_set_ringparam,
1029 .get_tx_csum = ethtool_op_get_tx_csum, 946 .get_tx_csum = ethtool_op_get_tx_csum,
1030 .get_sg = ethtool_op_get_sg, 947 .get_sg = ethtool_op_get_sg,
1031 .get_tso = ethtool_op_get_tso, 948 .get_tso = ethtool_op_get_tso,
1032 .get_strings = pcnet32_get_strings, 949 .get_strings = pcnet32_get_strings,
1033 .self_test_count = pcnet32_self_test_count, 950 .self_test_count = pcnet32_self_test_count,
1034 .self_test = pcnet32_ethtool_test, 951 .self_test = pcnet32_ethtool_test,
1035 .phys_id = pcnet32_phys_id, 952 .phys_id = pcnet32_phys_id,
1036 .get_regs_len = pcnet32_get_regs_len, 953 .get_regs_len = pcnet32_get_regs_len,
1037 .get_regs = pcnet32_get_regs, 954 .get_regs = pcnet32_get_regs,
1038 .get_perm_addr = ethtool_op_get_perm_addr, 955 .get_perm_addr = ethtool_op_get_perm_addr,
1039}; 956};
1040 957
1041/* only probes for non-PCI devices, the rest are handled by 958/* only probes for non-PCI devices, the rest are handled by
1042 * pci_register_driver via pcnet32_probe_pci */ 959 * pci_register_driver via pcnet32_probe_pci */
1043 960
1044static void __devinit 961static void __devinit pcnet32_probe_vlbus(void)
1045pcnet32_probe_vlbus(void)
1046{ 962{
1047 unsigned int *port, ioaddr; 963 unsigned int *port, ioaddr;
1048 964
1049 /* search for PCnet32 VLB cards at known addresses */ 965 /* search for PCnet32 VLB cards at known addresses */
1050 for (port = pcnet32_portlist; (ioaddr = *port); port++) { 966 for (port = pcnet32_portlist; (ioaddr = *port); port++) {
1051 if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) { 967 if (request_region
1052 /* check if there is really a pcnet chip on that ioaddr */ 968 (ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) {
1053 if ((inb(ioaddr + 14) == 0x57) && (inb(ioaddr + 15) == 0x57)) { 969 /* check if there is really a pcnet chip on that ioaddr */
1054 pcnet32_probe1(ioaddr, 0, NULL); 970 if ((inb(ioaddr + 14) == 0x57)
1055 } else { 971 && (inb(ioaddr + 15) == 0x57)) {
1056 release_region(ioaddr, PCNET32_TOTAL_SIZE); 972 pcnet32_probe1(ioaddr, 0, NULL);
1057 } 973 } else {
1058 } 974 release_region(ioaddr, PCNET32_TOTAL_SIZE);
1059 } 975 }
976 }
977 }
1060} 978}
1061 979
1062
1063static int __devinit 980static int __devinit
1064pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) 981pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent)
1065{ 982{
1066 unsigned long ioaddr; 983 unsigned long ioaddr;
1067 int err; 984 int err;
1068 985
1069 err = pci_enable_device(pdev); 986 err = pci_enable_device(pdev);
1070 if (err < 0) { 987 if (err < 0) {
1071 if (pcnet32_debug & NETIF_MSG_PROBE) 988 if (pcnet32_debug & NETIF_MSG_PROBE)
1072 printk(KERN_ERR PFX "failed to enable device -- err=%d\n", err); 989 printk(KERN_ERR PFX
1073 return err; 990 "failed to enable device -- err=%d\n", err);
1074 } 991 return err;
1075 pci_set_master(pdev); 992 }
993 pci_set_master(pdev);
994
995 ioaddr = pci_resource_start(pdev, 0);
996 if (!ioaddr) {
997 if (pcnet32_debug & NETIF_MSG_PROBE)
998 printk(KERN_ERR PFX
999 "card has no PCI IO resources, aborting\n");
1000 return -ENODEV;
1001 }
1076 1002
1077 ioaddr = pci_resource_start (pdev, 0); 1003 if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) {
1078 if (!ioaddr) { 1004 if (pcnet32_debug & NETIF_MSG_PROBE)
1079 if (pcnet32_debug & NETIF_MSG_PROBE) 1005 printk(KERN_ERR PFX
1080 printk (KERN_ERR PFX "card has no PCI IO resources, aborting\n"); 1006 "architecture does not support 32bit PCI busmaster DMA\n");
1081 return -ENODEV; 1007 return -ENODEV;
1082 } 1008 }
1009 if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") ==
1010 NULL) {
1011 if (pcnet32_debug & NETIF_MSG_PROBE)
1012 printk(KERN_ERR PFX
1013 "io address range already allocated\n");
1014 return -EBUSY;
1015 }
1083 1016
1084 if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { 1017 err = pcnet32_probe1(ioaddr, 1, pdev);
1085 if (pcnet32_debug & NETIF_MSG_PROBE) 1018 if (err < 0) {
1086 printk(KERN_ERR PFX "architecture does not support 32bit PCI busmaster DMA\n"); 1019 pci_disable_device(pdev);
1087 return -ENODEV; 1020 }
1088 } 1021 return err;
1089 if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") == NULL) {
1090 if (pcnet32_debug & NETIF_MSG_PROBE)
1091 printk(KERN_ERR PFX "io address range already allocated\n");
1092 return -EBUSY;
1093 }
1094
1095 err = pcnet32_probe1(ioaddr, 1, pdev);
1096 if (err < 0) {
1097 pci_disable_device(pdev);
1098 }
1099 return err;
1100} 1022}
1101 1023
1102
1103/* pcnet32_probe1 1024/* pcnet32_probe1
1104 * Called from both pcnet32_probe_vlbus and pcnet_probe_pci. 1025 * Called from both pcnet32_probe_vlbus and pcnet_probe_pci.
1105 * pdev will be NULL when called from pcnet32_probe_vlbus. 1026 * pdev will be NULL when called from pcnet32_probe_vlbus.
@@ -1107,630 +1028,764 @@ pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent)
1107static int __devinit 1028static int __devinit
1108pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) 1029pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1109{ 1030{
1110 struct pcnet32_private *lp; 1031 struct pcnet32_private *lp;
1111 dma_addr_t lp_dma_addr; 1032 dma_addr_t lp_dma_addr;
1112 int i, media; 1033 int i, media;
1113 int fdx, mii, fset, dxsuflo; 1034 int fdx, mii, fset, dxsuflo;
1114 int chip_version; 1035 int chip_version;
1115 char *chipname; 1036 char *chipname;
1116 struct net_device *dev; 1037 struct net_device *dev;
1117 struct pcnet32_access *a = NULL; 1038 struct pcnet32_access *a = NULL;
1118 u8 promaddr[6]; 1039 u8 promaddr[6];
1119 int ret = -ENODEV; 1040 int ret = -ENODEV;
1120 1041
1121 /* reset the chip */ 1042 /* reset the chip */
1122 pcnet32_wio_reset(ioaddr); 1043 pcnet32_wio_reset(ioaddr);
1123 1044
1124 /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ 1045 /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */
1125 if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { 1046 if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) {
1126 a = &pcnet32_wio; 1047 a = &pcnet32_wio;
1127 } else { 1048 } else {
1128 pcnet32_dwio_reset(ioaddr); 1049 pcnet32_dwio_reset(ioaddr);
1129 if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { 1050 if (pcnet32_dwio_read_csr(ioaddr, 0) == 4
1130 a = &pcnet32_dwio; 1051 && pcnet32_dwio_check(ioaddr)) {
1131 } else 1052 a = &pcnet32_dwio;
1132 goto err_release_region; 1053 } else
1133 } 1054 goto err_release_region;
1134 1055 }
1135 chip_version = a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr,89) << 16); 1056
1136 if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW)) 1057 chip_version =
1137 printk(KERN_INFO " PCnet chip version is %#x.\n", chip_version); 1058 a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr, 89) << 16);
1138 if ((chip_version & 0xfff) != 0x003) { 1059 if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW))
1139 if (pcnet32_debug & NETIF_MSG_PROBE) 1060 printk(KERN_INFO " PCnet chip version is %#x.\n",
1140 printk(KERN_INFO PFX "Unsupported chip version.\n"); 1061 chip_version);
1141 goto err_release_region; 1062 if ((chip_version & 0xfff) != 0x003) {
1142 } 1063 if (pcnet32_debug & NETIF_MSG_PROBE)
1143 1064 printk(KERN_INFO PFX "Unsupported chip version.\n");
1144 /* initialize variables */ 1065 goto err_release_region;
1145 fdx = mii = fset = dxsuflo = 0; 1066 }
1146 chip_version = (chip_version >> 12) & 0xffff; 1067
1147 1068 /* initialize variables */
1148 switch (chip_version) { 1069 fdx = mii = fset = dxsuflo = 0;
1149 case 0x2420: 1070 chip_version = (chip_version >> 12) & 0xffff;
1150 chipname = "PCnet/PCI 79C970"; /* PCI */ 1071
1151 break; 1072 switch (chip_version) {
1152 case 0x2430: 1073 case 0x2420:
1153 if (shared) 1074 chipname = "PCnet/PCI 79C970"; /* PCI */
1154 chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */ 1075 break;
1155 else 1076 case 0x2430:
1156 chipname = "PCnet/32 79C965"; /* 486/VL bus */ 1077 if (shared)
1157 break; 1078 chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */
1158 case 0x2621: 1079 else
1159 chipname = "PCnet/PCI II 79C970A"; /* PCI */ 1080 chipname = "PCnet/32 79C965"; /* 486/VL bus */
1160 fdx = 1; 1081 break;
1161 break; 1082 case 0x2621:
1162 case 0x2623: 1083 chipname = "PCnet/PCI II 79C970A"; /* PCI */
1163 chipname = "PCnet/FAST 79C971"; /* PCI */ 1084 fdx = 1;
1164 fdx = 1; mii = 1; fset = 1; 1085 break;
1165 break; 1086 case 0x2623:
1166 case 0x2624: 1087 chipname = "PCnet/FAST 79C971"; /* PCI */
1167 chipname = "PCnet/FAST+ 79C972"; /* PCI */ 1088 fdx = 1;
1168 fdx = 1; mii = 1; fset = 1; 1089 mii = 1;
1169 break; 1090 fset = 1;
1170 case 0x2625: 1091 break;
1171 chipname = "PCnet/FAST III 79C973"; /* PCI */ 1092 case 0x2624:
1172 fdx = 1; mii = 1; 1093 chipname = "PCnet/FAST+ 79C972"; /* PCI */
1173 break; 1094 fdx = 1;
1174 case 0x2626: 1095 mii = 1;
1175 chipname = "PCnet/Home 79C978"; /* PCI */ 1096 fset = 1;
1176 fdx = 1; 1097 break;
1098 case 0x2625:
1099 chipname = "PCnet/FAST III 79C973"; /* PCI */
1100 fdx = 1;
1101 mii = 1;
1102 break;
1103 case 0x2626:
1104 chipname = "PCnet/Home 79C978"; /* PCI */
1105 fdx = 1;
1106 /*
1107 * This is based on specs published at www.amd.com. This section
1108 * assumes that a card with a 79C978 wants to go into standard
1109 * ethernet mode. The 79C978 can also go into 1Mb HomePNA mode,
1110 * and the module option homepna=1 can select this instead.
1111 */
1112 media = a->read_bcr(ioaddr, 49);
1113 media &= ~3; /* default to 10Mb ethernet */
1114 if (cards_found < MAX_UNITS && homepna[cards_found])
1115 media |= 1; /* switch to home wiring mode */
1116 if (pcnet32_debug & NETIF_MSG_PROBE)
1117 printk(KERN_DEBUG PFX "media set to %sMbit mode.\n",
1118 (media & 1) ? "1" : "10");
1119 a->write_bcr(ioaddr, 49, media);
1120 break;
1121 case 0x2627:
1122 chipname = "PCnet/FAST III 79C975"; /* PCI */
1123 fdx = 1;
1124 mii = 1;
1125 break;
1126 case 0x2628:
1127 chipname = "PCnet/PRO 79C976";
1128 fdx = 1;
1129 mii = 1;
1130 break;
1131 default:
1132 if (pcnet32_debug & NETIF_MSG_PROBE)
1133 printk(KERN_INFO PFX
1134 "PCnet version %#x, no PCnet32 chip.\n",
1135 chip_version);
1136 goto err_release_region;
1137 }
1138
	/*
	 * On selected chips turn on the BCR18:NOUFLO bit. This stops transmit
	 * starting until the packet is loaded. Strike one for reliability, lose
	 * one for latency - although on PCI this isn't a big loss. Older chips
	 * have FIFOs smaller than a packet, so you can't do this.
	 * Turn on BCR18:BurstRdEn and BCR18:BurstWrEn.
	 */

	if (fset) {
		a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0860));
		a->write_csr(ioaddr, 80,
			     (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00);
		dxsuflo = 1;
	}

	dev = alloc_etherdev(0);
	if (!dev) {
		if (pcnet32_debug & NETIF_MSG_PROBE)
			printk(KERN_ERR PFX "Memory allocation failed.\n");
		ret = -ENOMEM;
		goto err_release_region;
	}
	SET_NETDEV_DEV(dev, &pdev->dev);

	if (pcnet32_debug & NETIF_MSG_PROBE)
		printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr);

	/* In most chips, after a chip reset, the ethernet address is read from the
	 * station address PROM at the base address and programmed into the
	 * "Physical Address Registers" CSR12-14.
	 * As a precautionary measure, we read the PROM values and complain if
	 * they disagree with the CSRs. Either way, we use the CSR values, and
	 * double check that they are valid.
	 */
	for (i = 0; i < 3; i++) {
		unsigned int val;
		val = a->read_csr(ioaddr, i + 12) & 0x0ffff;
		/* There may be endianness issues here. */
		dev->dev_addr[2 * i] = val & 0x0ff;
		dev->dev_addr[2 * i + 1] = (val >> 8) & 0x0ff;
	}

1248 /* read PROM address and compare with CSR address */
1249 for (i = 0; i < 6; i++)
1250 promaddr[i] = inb(ioaddr + i);
1251
1252 if (memcmp(promaddr, dev->dev_addr, 6)
1253 || !is_valid_ether_addr(dev->dev_addr)) {
1254 if (is_valid_ether_addr(promaddr)) {
1255 if (pcnet32_debug & NETIF_MSG_PROBE) {
1256 printk(" warning: CSR address invalid,\n");
1257 printk(KERN_INFO " using instead PROM address of");
1258 }
1259 memcpy(dev->dev_addr, promaddr, 6);
1260 }
1261 }
1262 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
1263
1264 /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */
1265 if (!is_valid_ether_addr(dev->perm_addr))
1266 memset(dev->dev_addr, 0, sizeof(dev->dev_addr));
1267
1268 if (pcnet32_debug & NETIF_MSG_PROBE) {
		for (i = 0; i < 6; i++)
			printk(" %2.2x", dev->dev_addr[i]);

		/* Version 0x2623 and 0x2624 */
		if (((chip_version + 1) & 0xfffe) == 0x2624) {
			i = a->read_csr(ioaddr, 80) & 0x0C00;	/* Check tx_start_pt */
			printk("\n" KERN_INFO " tx_start_pt(0x%04x):", i);
			switch (i >> 10) {
			case 0:
				printk(" 20 bytes,");
				break;
			case 1:
				printk(" 64 bytes,");
				break;
			case 2:
				printk(" 128 bytes,");
				break;
			case 3:
				printk("~220 bytes,");
				break;
			}
			i = a->read_bcr(ioaddr, 18);	/* Check Burst/Bus control */
			printk(" BCR18(%x):", i & 0xffff);
			if (i & (1 << 5))
				printk("BurstWrEn ");
			if (i & (1 << 6))
				printk("BurstRdEn ");
			if (i & (1 << 7))
				printk("DWordIO ");
			if (i & (1 << 11))
				printk("NoUFlow ");
			i = a->read_bcr(ioaddr, 25);
			printk("\n" KERN_INFO " SRAMSIZE=0x%04x,", i << 8);
			i = a->read_bcr(ioaddr, 26);
			printk(" SRAM_BND=0x%04x,", i << 8);
			i = a->read_bcr(ioaddr, 27);
			if (i & (1 << 14))
				printk("LowLatRx");
		}
	}

	dev->base_addr = ioaddr;
	/* pci_alloc_consistent returns page-aligned memory, so we do not have to check the alignment */
	if ((lp = pci_alloc_consistent(pdev, sizeof(*lp), &lp_dma_addr)) == NULL) {
		if (pcnet32_debug & NETIF_MSG_PROBE)
			printk(KERN_ERR PFX
			       "Consistent memory allocation failed.\n");
		ret = -ENOMEM;
		goto err_free_netdev;
	}

	memset(lp, 0, sizeof(*lp));
	lp->dma_addr = lp_dma_addr;
	lp->pci_dev = pdev;

	spin_lock_init(&lp->lock);

	SET_MODULE_OWNER(dev);
	SET_NETDEV_DEV(dev, &pdev->dev);
	dev->priv = lp;
	lp->name = chipname;
	lp->shared_irq = shared;
	lp->tx_ring_size = TX_RING_SIZE;	/* default tx ring size */
	lp->rx_ring_size = RX_RING_SIZE;	/* default rx ring size */
	lp->tx_mod_mask = lp->tx_ring_size - 1;
1269 lp->rx_mod_mask = lp->rx_ring_size - 1;
1270 lp->tx_len_bits = (PCNET32_LOG_TX_BUFFERS << 12);
1271 lp->rx_len_bits = (PCNET32_LOG_RX_BUFFERS << 4);
1272 lp->mii_if.full_duplex = fdx;
1273 lp->mii_if.phy_id_mask = 0x1f;
1274 lp->mii_if.reg_num_mask = 0x1f;
1275 lp->dxsuflo = dxsuflo;
1276 lp->mii = mii;
1277 lp->msg_enable = pcnet32_debug;
1278 if ((cards_found >= MAX_UNITS)
1279 || (options[cards_found] > sizeof(options_mapping)))
1280 lp->options = PCNET32_PORT_ASEL;
1281 else
1282 lp->options = options_mapping[options[cards_found]];
1283 lp->mii_if.dev = dev;
1284 lp->mii_if.mdio_read = mdio_read;
1285 lp->mii_if.mdio_write = mdio_write;
1286
1287 if (fdx && !(lp->options & PCNET32_PORT_ASEL) &&
1288 ((cards_found >= MAX_UNITS) || full_duplex[cards_found]))
1289 lp->options |= PCNET32_PORT_FD;
1290
1291 if (!a) {
1292 if (pcnet32_debug & NETIF_MSG_PROBE)
1293 printk(KERN_ERR PFX "No access methods\n");
1294 ret = -ENODEV;
1295 goto err_free_consistent;
1296 }
1297 lp->a = *a;
1298
1299 /* prior to register_netdev, dev->name is not yet correct */
1300 if (pcnet32_alloc_ring(dev, pci_name(lp->pci_dev))) {
1301 ret = -ENOMEM;
1302 goto err_free_ring;
1303 }
1304 /* detect special T1/E1 WAN card by checking for MAC address */
1305 if (dev->dev_addr[0] == 0x00 && dev->dev_addr[1] == 0xe0
	    && dev->dev_addr[2] == 0x75)
		lp->options = PCNET32_PORT_FD | PCNET32_PORT_GPSI;

	lp->init_block.mode = le16_to_cpu(0x0003);	/* Disable Rx and Tx. */
	lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits);
	for (i = 0; i < 6; i++)
		lp->init_block.phys_addr[i] = dev->dev_addr[i];
	lp->init_block.filter[0] = 0x00000000;
	lp->init_block.filter[1] = 0x00000000;
	lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr);
1317 lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr);
1318
1319 /* switch pcnet32 to 32bit mode */
1320 a->write_bcr(ioaddr, 20, 2);
1321
1322 a->write_csr(ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private,
1323 init_block)) & 0xffff);
1324 a->write_csr(ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private,
1325 init_block)) >> 16);
1326
1327 if (pdev) { /* use the IRQ provided by PCI */
1328 dev->irq = pdev->irq;
1329 if (pcnet32_debug & NETIF_MSG_PROBE)
1330 printk(" assigned IRQ %d.\n", dev->irq);
1331 } else {
1332 unsigned long irq_mask = probe_irq_on();
1333
1334 /*
1335 * To auto-IRQ we enable the initialization-done and DMA error
1336 * interrupts. For ISA boards we get a DMA error, but VLB and PCI
1337 * boards will work.
1338 */
1339 /* Trigger an initialization just for the interrupt. */
1340 a->write_csr(ioaddr, 0, 0x41);
1341 mdelay(1);
1342
1343 dev->irq = probe_irq_off(irq_mask);
1344 if (!dev->irq) {
1345 if (pcnet32_debug & NETIF_MSG_PROBE)
1346 printk(", failed to detect IRQ line.\n");
1347 ret = -ENODEV;
1348 goto err_free_ring;
1349 }
1350 if (pcnet32_debug & NETIF_MSG_PROBE)
1351 printk(", probed IRQ %d.\n", dev->irq);
1352 }

	/* Set the mii phy_id so that we can query the link state */
	if (lp->mii) {
		/* lp->phycount and lp->phymask are set to 0 by memset above */

		lp->mii_if.phy_id = ((lp->a.read_bcr(ioaddr, 33)) >> 5) & 0x1f;
		/* scan for PHYs */
		for (i = 0; i < PCNET32_MAX_PHYS; i++) {
			unsigned short id1, id2;

			id1 = mdio_read(dev, i, MII_PHYSID1);
			if (id1 == 0xffff)
				continue;
			id2 = mdio_read(dev, i, MII_PHYSID2);
			if (id2 == 0xffff)
				continue;
			if (i == 31 && ((chip_version + 1) & 0xfffe) == 0x2624)
				continue;	/* 79C971 & 79C972 have phantom phy at id 31 */
			lp->phycount++;
			lp->phymask |= (1 << i);
			lp->mii_if.phy_id = i;
			if (pcnet32_debug & NETIF_MSG_PROBE)
				printk(KERN_INFO PFX
				       "Found PHY %04x:%04x at address %d.\n",
				       id1, id2, i);
		}
		lp->a.write_bcr(ioaddr, 33, (lp->mii_if.phy_id) << 5);
		if (lp->phycount > 1) {
			lp->options |= PCNET32_PORT_MII;
		}
	}

	init_timer(&lp->watchdog_timer);
	lp->watchdog_timer.data = (unsigned long)dev;
	lp->watchdog_timer.function = (void *)&pcnet32_watchdog;

	/* The PCNET32-specific entries in the device structure. */
	dev->open = &pcnet32_open;
	dev->hard_start_xmit = &pcnet32_start_xmit;
	dev->stop = &pcnet32_close;
	dev->get_stats = &pcnet32_get_stats;
	dev->set_multicast_list = &pcnet32_set_multicast_list;
	dev->do_ioctl = &pcnet32_ioctl;
	dev->ethtool_ops = &pcnet32_ethtool_ops;
	dev->tx_timeout = pcnet32_tx_timeout;
	dev->watchdog_timeo = (5 * HZ);

#ifdef CONFIG_NET_POLL_CONTROLLER
	dev->poll_controller = pcnet32_poll_controller;
#endif

	/* Fill in the generic fields of the device structure. */
	if (register_netdev(dev))
		goto err_free_ring;

	if (pdev) {
		pci_set_drvdata(pdev, dev);
	} else {
		lp->next = pcnet32_dev;
		pcnet32_dev = dev;
	}

1415 if (pcnet32_debug & NETIF_MSG_PROBE)
1416 printk(KERN_INFO "%s: registered as %s\n", dev->name, lp->name);
1417 cards_found++;
1418
1419 /* enable LED writes */
1420 a->write_bcr(ioaddr, 2, a->read_bcr(ioaddr, 2) | 0x1000);
1421
1422 return 0;
1423
1424 err_free_ring:
1425 pcnet32_free_ring(dev);
1426 err_free_consistent:
1427 pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr);
1428 err_free_netdev:
1429 free_netdev(dev);
1430 err_release_region:
1431 release_region(ioaddr, PCNET32_TOTAL_SIZE);
1432 return ret;
1433}

/* if any allocation fails, caller must also call pcnet32_free_ring */
static int pcnet32_alloc_ring(struct net_device *dev, char *name)
{
	struct pcnet32_private *lp = dev->priv;

	lp->tx_ring = pci_alloc_consistent(lp->pci_dev,
					   sizeof(struct pcnet32_tx_head) *
					   lp->tx_ring_size,
					   &lp->tx_ring_dma_addr);
	if (lp->tx_ring == NULL) {
		if (pcnet32_debug & NETIF_MSG_DRV)
			printk("\n" KERN_ERR PFX
			       "%s: Consistent memory allocation failed.\n",
			       name);
		return -ENOMEM;
	}

	lp->rx_ring = pci_alloc_consistent(lp->pci_dev,
					   sizeof(struct pcnet32_rx_head) *
					   lp->rx_ring_size,
					   &lp->rx_ring_dma_addr);
	if (lp->rx_ring == NULL) {
		if (pcnet32_debug & NETIF_MSG_DRV)
			printk("\n" KERN_ERR PFX
			       "%s: Consistent memory allocation failed.\n",
			       name);
		return -ENOMEM;
	}

1464 lp->tx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->tx_ring_size,
1465 GFP_ATOMIC);
1466 if (!lp->tx_dma_addr) {
1467 if (pcnet32_debug & NETIF_MSG_DRV)
1468 printk("\n" KERN_ERR PFX
1469 "%s: Memory allocation failed.\n", name);
1470 return -ENOMEM;
1471 }
1472 memset(lp->tx_dma_addr, 0, sizeof(dma_addr_t) * lp->tx_ring_size);
1473
1474 lp->rx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->rx_ring_size,
1475 GFP_ATOMIC);
1476 if (!lp->rx_dma_addr) {
1477 if (pcnet32_debug & NETIF_MSG_DRV)
1478 printk("\n" KERN_ERR PFX
1479 "%s: Memory allocation failed.\n", name);
1480 return -ENOMEM;
1481 }
1482 memset(lp->rx_dma_addr, 0, sizeof(dma_addr_t) * lp->rx_ring_size);
1483
1484 lp->tx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->tx_ring_size,
1485 GFP_ATOMIC);
1486 if (!lp->tx_skbuff) {
1487 if (pcnet32_debug & NETIF_MSG_DRV)
1488 printk("\n" KERN_ERR PFX
1489 "%s: Memory allocation failed.\n", name);
1490 return -ENOMEM;
1491 }
1492 memset(lp->tx_skbuff, 0, sizeof(struct sk_buff *) * lp->tx_ring_size);
1493
1494 lp->rx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->rx_ring_size,
1495 GFP_ATOMIC);
1496 if (!lp->rx_skbuff) {
1497 if (pcnet32_debug & NETIF_MSG_DRV)
1498 printk("\n" KERN_ERR PFX
1499 "%s: Memory allocation failed.\n", name);
1500 return -ENOMEM;
1501 }
1502 memset(lp->rx_skbuff, 0, sizeof(struct sk_buff *) * lp->rx_ring_size);
1503
1504 return 0;
1505}

static void pcnet32_free_ring(struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;

	kfree(lp->tx_skbuff);
	lp->tx_skbuff = NULL;

	kfree(lp->rx_skbuff);
	lp->rx_skbuff = NULL;

	kfree(lp->tx_dma_addr);
	lp->tx_dma_addr = NULL;

	kfree(lp->rx_dma_addr);
	lp->rx_dma_addr = NULL;

	if (lp->tx_ring) {
		pci_free_consistent(lp->pci_dev,
				    sizeof(struct pcnet32_tx_head) * lp->tx_ring_size,
				    lp->tx_ring, lp->tx_ring_dma_addr);
		lp->tx_ring = NULL;
	}

	if (lp->rx_ring) {
		pci_free_consistent(lp->pci_dev,
				    sizeof(struct pcnet32_rx_head) * lp->rx_ring_size,
				    lp->rx_ring, lp->rx_ring_dma_addr);
		lp->rx_ring = NULL;
	}
}

static int pcnet32_open(struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;
	unsigned long ioaddr = dev->base_addr;
	u16 val;
	int i;
	int rc;
	unsigned long flags;

	if (request_irq(dev->irq, &pcnet32_interrupt,
			lp->shared_irq ? SA_SHIRQ : 0, dev->name, (void *)dev)) {
		return -EAGAIN;
	}

	spin_lock_irqsave(&lp->lock, flags);
	/* Check for a valid station address */
	if (!is_valid_ether_addr(dev->dev_addr)) {
		rc = -EINVAL;
		goto err_free_irq;
	}

	/* Reset the PCNET32 */
	lp->a.reset(ioaddr);

	/* switch pcnet32 to 32bit mode */
	lp->a.write_bcr(ioaddr, 20, 2);

	if (netif_msg_ifup(lp))
		printk(KERN_DEBUG
		       "%s: pcnet32_open() irq %d tx/rx rings %#x/%#x init %#x.\n",
		       dev->name, dev->irq, (u32) (lp->tx_ring_dma_addr),
		       (u32) (lp->rx_ring_dma_addr),
		       (u32) (lp->dma_addr + offsetof(struct pcnet32_private, init_block)));

	/* set/reset autoselect bit */
	val = lp->a.read_bcr(ioaddr, 2) & ~2;
	if (lp->options & PCNET32_PORT_ASEL)
		val |= 2;
	lp->a.write_bcr(ioaddr, 2, val);

	/* handle full duplex setting */
	if (lp->mii_if.full_duplex) {
		val = lp->a.read_bcr(ioaddr, 9) & ~3;
		if (lp->options & PCNET32_PORT_FD) {
			val |= 1;
			if (lp->options == (PCNET32_PORT_FD | PCNET32_PORT_AUI))
				val |= 2;
		} else if (lp->options & PCNET32_PORT_ASEL) {
			/* workaround of xSeries250, turn on for 79C975 only */
			i = ((lp->a.read_csr(ioaddr, 88) |
			      (lp->a.read_csr(ioaddr, 89) << 16)) >> 12) & 0xffff;
			if (i == 0x2627)
				val |= 3;
		}
		lp->a.write_bcr(ioaddr, 9, val);
	}

	/* set/reset GPSI bit in test register */
	val = lp->a.read_csr(ioaddr, 124) & ~0x10;
	if ((lp->options & PCNET32_PORT_PORTSEL) == PCNET32_PORT_GPSI)
		val |= 0x10;
	lp->a.write_csr(ioaddr, 124, val);

	/* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */
	if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT &&
	    (lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX ||
	     lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) {
		if (lp->options & PCNET32_PORT_ASEL) {
			lp->options = PCNET32_PORT_FD | PCNET32_PORT_100;
			if (netif_msg_link(lp))
				printk(KERN_DEBUG
				       "%s: Setting 100Mb-Full Duplex.\n",
				       dev->name);
		}
	}
	if (lp->phycount < 2) {
		/*
		 * 24 Jun 2004 according to AMD, in order to change the PHY,
		 * DANAS (or DISPM for 79C976) must be set; then select the speed,
		 * duplex, and/or enable auto negotiation, and clear DANAS
		 */
		if (lp->mii && !(lp->options & PCNET32_PORT_ASEL)) {
			lp->a.write_bcr(ioaddr, 32,
					lp->a.read_bcr(ioaddr, 32) | 0x0080);
			/* disable Auto Negotiation, set 10Mbps, HD */
			val = lp->a.read_bcr(ioaddr, 32) & ~0xb8;
			if (lp->options & PCNET32_PORT_FD)
				val |= 0x10;
			if (lp->options & PCNET32_PORT_100)
				val |= 0x08;
			lp->a.write_bcr(ioaddr, 32, val);
		} else {
			if (lp->options & PCNET32_PORT_ASEL) {
				lp->a.write_bcr(ioaddr, 32,
						lp->a.read_bcr(ioaddr, 32) | 0x0080);
				/* enable auto negotiate, setup, disable fd */
				val = lp->a.read_bcr(ioaddr, 32) & ~0x98;
				val |= 0x20;
				lp->a.write_bcr(ioaddr, 32, val);
			}
		}
	} else {
		int first_phy = -1;
		u16 bmcr;
		u32 bcr9;
		struct ethtool_cmd ecmd;

		/*
		 * There is really no good other way to handle multiple PHYs
		 * other than turning off all automatics
		 */
		val = lp->a.read_bcr(ioaddr, 2);
		lp->a.write_bcr(ioaddr, 2, val & ~2);
		val = lp->a.read_bcr(ioaddr, 32);
		lp->a.write_bcr(ioaddr, 32, val & ~(1 << 7));	/* stop MII manager */

		if (!(lp->options & PCNET32_PORT_ASEL)) {
			/* setup ecmd */
			ecmd.port = PORT_MII;
			ecmd.transceiver = XCVR_INTERNAL;
			ecmd.autoneg = AUTONEG_DISABLE;
			ecmd.speed = lp->options & PCNET32_PORT_100 ? SPEED_100 : SPEED_10;
			bcr9 = lp->a.read_bcr(ioaddr, 9);

			if (lp->options & PCNET32_PORT_FD) {
				ecmd.duplex = DUPLEX_FULL;
				bcr9 |= (1 << 0);
			} else {
				ecmd.duplex = DUPLEX_HALF;
				bcr9 |= ~(1 << 0);
			}
			lp->a.write_bcr(ioaddr, 9, bcr9);
		}

		for (i = 0; i < PCNET32_MAX_PHYS; i++) {
			if (lp->phymask & (1 << i)) {
				/* isolate all but the first PHY */
				bmcr = mdio_read(dev, i, MII_BMCR);
				if (first_phy == -1) {
					first_phy = i;
					mdio_write(dev, i, MII_BMCR,
						   bmcr & ~BMCR_ISOLATE);
				} else {
					mdio_write(dev, i, MII_BMCR,
						   bmcr | BMCR_ISOLATE);
				}
				/* use mii_ethtool_sset to setup PHY */
				lp->mii_if.phy_id = i;
				ecmd.phy_address = i;
				if (lp->options & PCNET32_PORT_ASEL) {
					mii_ethtool_gset(&lp->mii_if, &ecmd);
					ecmd.autoneg = AUTONEG_ENABLE;
				}
				mii_ethtool_sset(&lp->mii_if, &ecmd);
			}
		}
		lp->mii_if.phy_id = first_phy;
		if (netif_msg_link(lp))
			printk(KERN_INFO "%s: Using PHY number %d.\n",
			       dev->name, first_phy);
	}

#ifdef DO_DXSUFLO
	if (lp->dxsuflo) {	/* Disable transmit stop on underflow */
		val = lp->a.read_csr(ioaddr, 3);
		val |= 0x40;
		lp->a.write_csr(ioaddr, 3, val);
	}
#endif

	lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7);
	pcnet32_load_multicast(dev);

	if (pcnet32_init_ring(dev)) {
		rc = -ENOMEM;
		goto err_free_ring;
	}

	/* Re-initialize the PCNET32, and start it when done. */
	lp->a.write_csr(ioaddr, 1, (lp->dma_addr +
				    offsetof(struct pcnet32_private, init_block)) & 0xffff);
	lp->a.write_csr(ioaddr, 2, (lp->dma_addr +
				    offsetof(struct pcnet32_private, init_block)) >> 16);

	lp->a.write_csr(ioaddr, 4, 0x0915);
	lp->a.write_csr(ioaddr, 0, 0x0001);

	netif_start_queue(dev);

	/* Print the link status and start the watchdog */
	pcnet32_check_media(dev, 1);
	mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT);

	i = 0;
	while (i++ < 100)
		if (lp->a.read_csr(ioaddr, 0) & 0x0100)
			break;
	/*
	 * We used to clear the InitDone bit, 0x0100, here but Mark Stockton
	 * reports that doing so triggers a bug in the '974.
	 */
	lp->a.write_csr(ioaddr, 0, 0x0042);

	if (netif_msg_ifup(lp))
		printk(KERN_DEBUG
		       "%s: pcnet32 open after %d ticks, init block %#x csr0 %4.4x.\n",
		       dev->name, i,
		       (u32) (lp->dma_addr + offsetof(struct pcnet32_private, init_block)),
		       lp->a.read_csr(ioaddr, 0));

	spin_unlock_irqrestore(&lp->lock, flags);

	return 0;	/* Always succeed */

err_free_ring:
	/* free any allocated skbuffs */
	for (i = 0; i < lp->rx_ring_size; i++) {
		lp->rx_ring[i].status = 0;
		if (lp->rx_skbuff[i]) {
			pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i],
					 PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE);
			dev_kfree_skb(lp->rx_skbuff[i]);
		}
		lp->rx_skbuff[i] = NULL;
		lp->rx_dma_addr[i] = 0;
	}

	pcnet32_free_ring(dev);

	/*
	 * Switch back to 16bit mode to avoid problems with dumb
	 * DOS packet driver after a warm reboot
	 */
	lp->a.write_bcr(ioaddr, 20, 4);

err_free_irq:
	spin_unlock_irqrestore(&lp->lock, flags);
	free_irq(dev->irq, dev);
	return rc;
}

/*
@@ -1746,727 +1801,893 @@ err_free_irq:
 * restarting the chip, but I'm too lazy to do so right now. dplatt@3do.com
 */

static void pcnet32_purge_tx_ring(struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;
	int i;

	for (i = 0; i < lp->tx_ring_size; i++) {
		lp->tx_ring[i].status = 0;	/* CPU owns buffer */
		wmb();	/* Make sure adapter sees owner change */
		if (lp->tx_skbuff[i]) {
			pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i],
					 lp->tx_skbuff[i]->len,
					 PCI_DMA_TODEVICE);
			dev_kfree_skb_any(lp->tx_skbuff[i]);
		}
		lp->tx_skbuff[i] = NULL;
		lp->tx_dma_addr[i] = 0;
	}
}

/* Initialize the PCNET32 Rx and Tx rings. */
static int pcnet32_init_ring(struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;
	int i;

	lp->tx_full = 0;
	lp->cur_rx = lp->cur_tx = 0;
	lp->dirty_rx = lp->dirty_tx = 0;

	for (i = 0; i < lp->rx_ring_size; i++) {
		struct sk_buff *rx_skbuff = lp->rx_skbuff[i];
		if (rx_skbuff == NULL) {
			if (!(rx_skbuff = lp->rx_skbuff[i] = dev_alloc_skb(PKT_BUF_SZ))) {
				/* there is not much we can do at this point */
				if (pcnet32_debug & NETIF_MSG_DRV)
					printk(KERN_ERR
					       "%s: pcnet32_init_ring dev_alloc_skb failed.\n",
					       dev->name);
				return -1;
			}
			skb_reserve(rx_skbuff, 2);
		}

		rmb();
		if (lp->rx_dma_addr[i] == 0)
			lp->rx_dma_addr[i] =
			    pci_map_single(lp->pci_dev, rx_skbuff->data,
					   PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE);
		lp->rx_ring[i].base = (u32) le32_to_cpu(lp->rx_dma_addr[i]);
		lp->rx_ring[i].buf_length = le16_to_cpu(2 - PKT_BUF_SZ);
		wmb();	/* Make sure owner changes after all others are visible */
		lp->rx_ring[i].status = le16_to_cpu(0x8000);
	}
	/* The Tx buffer address is filled in as needed, but we do need to clear
	 * the upper ownership bit. */
	for (i = 0; i < lp->tx_ring_size; i++) {
		lp->tx_ring[i].status = 0;	/* CPU owns buffer */
		wmb();	/* Make sure adapter sees owner change */
		lp->tx_ring[i].base = 0;
		lp->tx_dma_addr[i] = 0;
	}

	lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits);
	for (i = 0; i < 6; i++)
		lp->init_block.phys_addr[i] = dev->dev_addr[i];
	lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr);
	lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr);
	wmb();	/* Make sure all changes are visible */
	return 0;
}

/* the pcnet32 has been issued a stop or reset. Wait for the stop bit
 * then flush the pending transmit operations, re-initialize the ring,
 * and tell the chip to initialize.
 */
static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits)
{
	struct pcnet32_private *lp = dev->priv;
	unsigned long ioaddr = dev->base_addr;
	int i;

	/* wait for stop */
	for (i = 0; i < 100; i++)
		if (lp->a.read_csr(ioaddr, 0) & 0x0004)
			break;

	if (i >= 100 && netif_msg_drv(lp))
		printk(KERN_ERR
		       "%s: pcnet32_restart timed out waiting for stop.\n",
		       dev->name);

	pcnet32_purge_tx_ring(dev);
	if (pcnet32_init_ring(dev))
		return;

	/* ReInit Ring */
	lp->a.write_csr(ioaddr, 0, 1);
	i = 0;
	while (i++ < 1000)
		if (lp->a.read_csr(ioaddr, 0) & 0x0100)
			break;

	lp->a.write_csr(ioaddr, 0, csr0_bits);
}

static void pcnet32_tx_timeout(struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;
	unsigned long ioaddr = dev->base_addr, flags;

	spin_lock_irqsave(&lp->lock, flags);
	/* Transmitter timeout, serious problems. */
	if (pcnet32_debug & NETIF_MSG_DRV)
		printk(KERN_ERR
		       "%s: transmit timed out, status %4.4x, resetting.\n",
		       dev->name, lp->a.read_csr(ioaddr, 0));
	lp->a.write_csr(ioaddr, 0, 0x0004);
	lp->stats.tx_errors++;
	if (netif_msg_tx_err(lp)) {
		int i;
		printk(KERN_DEBUG
		       " Ring data dump: dirty_tx %d cur_tx %d%s cur_rx %d.",
		       lp->dirty_tx, lp->cur_tx, lp->tx_full ? " (full)" : "",
		       lp->cur_rx);
		for (i = 0; i < lp->rx_ring_size; i++)
			printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ",
			       le32_to_cpu(lp->rx_ring[i].base),
			       (-le16_to_cpu(lp->rx_ring[i].buf_length)) & 0xffff,
			       le32_to_cpu(lp->rx_ring[i].msg_length),
			       le16_to_cpu(lp->rx_ring[i].status));
		for (i = 0; i < lp->tx_ring_size; i++)
			printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ",
			       le32_to_cpu(lp->tx_ring[i].base),
			       (-le16_to_cpu(lp->tx_ring[i].length)) & 0xffff,
			       le32_to_cpu(lp->tx_ring[i].misc),
			       le16_to_cpu(lp->tx_ring[i].status));
		printk("\n");
	}
	pcnet32_restart(dev, 0x0042);

	dev->trans_start = jiffies;
	netif_wake_queue(dev);

	spin_unlock_irqrestore(&lp->lock, flags);
}

static int pcnet32_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct pcnet32_private *lp = dev->priv;
	unsigned long ioaddr = dev->base_addr;
	u16 status;
	int entry;
	unsigned long flags;

	spin_lock_irqsave(&lp->lock, flags);

	if (netif_msg_tx_queued(lp)) {
		printk(KERN_DEBUG
		       "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n",
		       dev->name, lp->a.read_csr(ioaddr, 0));
	}

	/* Default status -- will not enable Successful-TxDone
	 * interrupt when that option is available to us.
	 */
	status = 0x8300;

	/* Fill in a Tx ring entry */

	/* Mask to ring buffer boundary. */
	entry = lp->cur_tx & lp->tx_mod_mask;

	/* Caution: the write order is important here, set the status
	 * with the "ownership" bits last. */

	lp->tx_ring[entry].length = le16_to_cpu(-skb->len);

	lp->tx_ring[entry].misc = 0x00000000;

	lp->tx_skbuff[entry] = skb;
	lp->tx_dma_addr[entry] =
	    pci_map_single(lp->pci_dev, skb->data, skb->len, PCI_DMA_TODEVICE);
	lp->tx_ring[entry].base = (u32) le32_to_cpu(lp->tx_dma_addr[entry]);
	wmb();	/* Make sure owner changes after all others are visible */
	lp->tx_ring[entry].status = le16_to_cpu(status);

	lp->cur_tx++;
	lp->stats.tx_bytes += skb->len;

	/* Trigger an immediate send poll. */
	lp->a.write_csr(ioaddr, 0, 0x0048);

	dev->trans_start = jiffies;

	if (lp->tx_ring[(entry + 1) & lp->tx_mod_mask].base != 0) {
		lp->tx_full = 1;
		netif_stop_queue(dev);
	}
	spin_unlock_irqrestore(&lp->lock, flags);
	return 0;
}
1951 2008
1952/* The PCNET32 interrupt handler. */ 2009/* The PCNET32 interrupt handler. */
1953static irqreturn_t 2010static irqreturn_t
1954pcnet32_interrupt(int irq, void *dev_id, struct pt_regs * regs) 2011pcnet32_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1955{ 2012{
1956 struct net_device *dev = dev_id; 2013 struct net_device *dev = dev_id;
1957 struct pcnet32_private *lp; 2014 struct pcnet32_private *lp;
1958 unsigned long ioaddr; 2015 unsigned long ioaddr;
1959 u16 csr0,rap; 2016 u16 csr0, rap;
1960 int boguscnt = max_interrupt_work; 2017 int boguscnt = max_interrupt_work;
1961 int must_restart; 2018 int must_restart;
1962 2019
1963 if (!dev) { 2020 if (!dev) {
1964 if (pcnet32_debug & NETIF_MSG_INTR) 2021 if (pcnet32_debug & NETIF_MSG_INTR)
1965 printk (KERN_DEBUG "%s(): irq %d for unknown device\n", 2022 printk(KERN_DEBUG "%s(): irq %d for unknown device\n",
1966 __FUNCTION__, irq); 2023 __FUNCTION__, irq);
1967 return IRQ_NONE; 2024 return IRQ_NONE;
1968 }
1969
1970 ioaddr = dev->base_addr;
1971 lp = dev->priv;
1972
1973 spin_lock(&lp->lock);
1974
1975 rap = lp->a.read_rap(ioaddr);
1976 while ((csr0 = lp->a.read_csr (ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) {
1977 if (csr0 == 0xffff) {
1978 break; /* PCMCIA remove happened */
1979 } 2025 }
1980 /* Acknowledge all of the current interrupt sources ASAP. */
1981 lp->a.write_csr (ioaddr, 0, csr0 & ~0x004f);
1982 2026
1983 must_restart = 0; 2027 ioaddr = dev->base_addr;
2028 lp = dev->priv;
1984 2029
1985 if (netif_msg_intr(lp)) 2030 spin_lock(&lp->lock);
1986 printk(KERN_DEBUG "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n", 2031
1987 dev->name, csr0, lp->a.read_csr (ioaddr, 0)); 2032 rap = lp->a.read_rap(ioaddr);
1988 2033 while ((csr0 = lp->a.read_csr(ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) {
1989 if (csr0 & 0x0400) /* Rx interrupt */ 2034 if (csr0 == 0xffff) {
1990 pcnet32_rx(dev); 2035 break; /* PCMCIA remove happened */
1991 2036 }
1992 if (csr0 & 0x0200) { /* Tx-done interrupt */ 2037 /* Acknowledge all of the current interrupt sources ASAP. */
1993 unsigned int dirty_tx = lp->dirty_tx; 2038 lp->a.write_csr(ioaddr, 0, csr0 & ~0x004f);
1994 int delta; 2039
1995 2040 must_restart = 0;
1996 while (dirty_tx != lp->cur_tx) { 2041
1997 int entry = dirty_tx & lp->tx_mod_mask; 2042 if (netif_msg_intr(lp))
1998 int status = (short)le16_to_cpu(lp->tx_ring[entry].status); 2043 printk(KERN_DEBUG
1999 2044 "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n",
2000 if (status < 0) 2045 dev->name, csr0, lp->a.read_csr(ioaddr, 0));
2001 break; /* It still hasn't been Txed */ 2046
2002 2047 if (csr0 & 0x0400) /* Rx interrupt */
2003 lp->tx_ring[entry].base = 0; 2048 pcnet32_rx(dev);
2004 2049
2005 if (status & 0x4000) { 2050 if (csr0 & 0x0200) { /* Tx-done interrupt */
2006 /* There was an major error, log it. */ 2051 unsigned int dirty_tx = lp->dirty_tx;
2007 int err_status = le32_to_cpu(lp->tx_ring[entry].misc); 2052 int delta;
2008 lp->stats.tx_errors++; 2053
2009 if (netif_msg_tx_err(lp)) 2054 while (dirty_tx != lp->cur_tx) {
2010 printk(KERN_ERR "%s: Tx error status=%04x err_status=%08x\n", 2055 int entry = dirty_tx & lp->tx_mod_mask;
2011 dev->name, status, err_status); 2056 int status =
2012 if (err_status & 0x04000000) lp->stats.tx_aborted_errors++; 2057 (short)le16_to_cpu(lp->tx_ring[entry].
2013 if (err_status & 0x08000000) lp->stats.tx_carrier_errors++; 2058 status);
2014 if (err_status & 0x10000000) lp->stats.tx_window_errors++; 2059
2060 if (status < 0)
2061 break; /* It still hasn't been Txed */
2062
2063 lp->tx_ring[entry].base = 0;
2064
2065 if (status & 0x4000) {
2066 /* There was an major error, log it. */
2067 int err_status =
2068 le32_to_cpu(lp->tx_ring[entry].
2069 misc);
2070 lp->stats.tx_errors++;
2071 if (netif_msg_tx_err(lp))
2072 printk(KERN_ERR
2073 "%s: Tx error status=%04x err_status=%08x\n",
2074 dev->name, status,
2075 err_status);
2076 if (err_status & 0x04000000)
2077 lp->stats.tx_aborted_errors++;
2078 if (err_status & 0x08000000)
2079 lp->stats.tx_carrier_errors++;
2080 if (err_status & 0x10000000)
2081 lp->stats.tx_window_errors++;
2015#ifndef DO_DXSUFLO 2082#ifndef DO_DXSUFLO
2016 if (err_status & 0x40000000) { 2083 if (err_status & 0x40000000) {
2017 lp->stats.tx_fifo_errors++; 2084 lp->stats.tx_fifo_errors++;
2018 /* Ackk! On FIFO errors the Tx unit is turned off! */ 2085 /* Ackk! On FIFO errors the Tx unit is turned off! */
2019 /* Remove this verbosity later! */ 2086 /* Remove this verbosity later! */
2020 if (netif_msg_tx_err(lp)) 2087 if (netif_msg_tx_err(lp))
2021 printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", 2088 printk(KERN_ERR
2022 dev->name, csr0); 2089 "%s: Tx FIFO error! CSR0=%4.4x\n",
2023 must_restart = 1; 2090 dev->name, csr0);
2024 } 2091 must_restart = 1;
2092 }
2025#else 2093#else
2026 if (err_status & 0x40000000) { 2094 if (err_status & 0x40000000) {
2027 lp->stats.tx_fifo_errors++; 2095 lp->stats.tx_fifo_errors++;
2028 if (! lp->dxsuflo) { /* If controller doesn't recover ... */ 2096 if (!lp->dxsuflo) { /* If controller doesn't recover ... */
2029 /* Ackk! On FIFO errors the Tx unit is turned off! */ 2097 /* Ackk! On FIFO errors the Tx unit is turned off! */
2030 /* Remove this verbosity later! */ 2098 /* Remove this verbosity later! */
2031 if (netif_msg_tx_err(lp)) 2099 if (netif_msg_tx_err
2032 printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", 2100 (lp))
2033 dev->name, csr0); 2101 printk(KERN_ERR
2034 must_restart = 1; 2102 "%s: Tx FIFO error! CSR0=%4.4x\n",
2035 } 2103 dev->
2036 } 2104 name,
2105 csr0);
2106 must_restart = 1;
2107 }
2108 }
2037#endif 2109#endif
2038 } else { 2110 } else {
2039 if (status & 0x1800) 2111 if (status & 0x1800)
2040 lp->stats.collisions++; 2112 lp->stats.collisions++;
2041 lp->stats.tx_packets++; 2113 lp->stats.tx_packets++;
2114 }
2115
2116 /* We must free the original skb */
2117 if (lp->tx_skbuff[entry]) {
2118 pci_unmap_single(lp->pci_dev,
2119 lp->tx_dma_addr[entry],
2120 lp->tx_skbuff[entry]->
2121 len, PCI_DMA_TODEVICE);
2122 dev_kfree_skb_irq(lp->tx_skbuff[entry]);
2123 lp->tx_skbuff[entry] = NULL;
2124 lp->tx_dma_addr[entry] = 0;
2125 }
2126 dirty_tx++;
2127 }
2128
2129 delta =
2130 (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask +
2131 lp->tx_ring_size);
2132 if (delta > lp->tx_ring_size) {
2133 if (netif_msg_drv(lp))
2134 printk(KERN_ERR
2135 "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n",
2136 dev->name, dirty_tx, lp->cur_tx,
2137 lp->tx_full);
2138 dirty_tx += lp->tx_ring_size;
2139 delta -= lp->tx_ring_size;
2140 }
2141
2142 if (lp->tx_full &&
2143 netif_queue_stopped(dev) &&
2144 delta < lp->tx_ring_size - 2) {
2145 /* The ring is no longer full, clear tbusy. */
2146 lp->tx_full = 0;
2147 netif_wake_queue(dev);
2148 }
2149 lp->dirty_tx = dirty_tx;
2150 }
2151
2152 /* Log misc errors. */
2153 if (csr0 & 0x4000)
2154 lp->stats.tx_errors++; /* Tx babble. */
2155 if (csr0 & 0x1000) {
2156 /*
2157 * this happens when our receive ring is full. This shouldn't
2158 * be a problem as we will see normal rx interrupts for the frames
2159 * in the receive ring. But there are some PCI chipsets (I can
2160 * reproduce this on SP3G with Intel saturn chipset) which have
2161 * sometimes problems and will fill up the receive ring with
2162 * error descriptors. In this situation we don't get a rx
2163 * interrupt, but a missed frame interrupt sooner or later.
2164 * So we try to clean up our receive ring here.
2165 */
2166 pcnet32_rx(dev);
2167 lp->stats.rx_errors++; /* Missed a Rx frame. */
2168 }
2169 if (csr0 & 0x0800) {
2170 if (netif_msg_drv(lp))
2171 printk(KERN_ERR
2172 "%s: Bus master arbitration failure, status %4.4x.\n",
2173 dev->name, csr0);
2174 /* unlike for the lance, there is no restart needed */
2042 } 2175 }
2043 2176
2044 /* We must free the original skb */ 2177 if (must_restart) {
2045 if (lp->tx_skbuff[entry]) { 2178 /* reset the chip to clear the error condition, then restart */
2046 pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[entry], 2179 lp->a.reset(ioaddr);
2047 lp->tx_skbuff[entry]->len, PCI_DMA_TODEVICE); 2180 lp->a.write_csr(ioaddr, 4, 0x0915);
2048 dev_kfree_skb_irq(lp->tx_skbuff[entry]); 2181 pcnet32_restart(dev, 0x0002);
2049 lp->tx_skbuff[entry] = NULL; 2182 netif_wake_queue(dev);
2050 lp->tx_dma_addr[entry] = 0;
2051 } 2183 }
2052 dirty_tx++; 2184 }
2053 } 2185
2054 2186 /* Set interrupt enable. */
2055 delta = (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask + lp->tx_ring_size); 2187 lp->a.write_csr(ioaddr, 0, 0x0040);
2056 if (delta > lp->tx_ring_size) { 2188 lp->a.write_rap(ioaddr, rap);
2057 if (netif_msg_drv(lp)) 2189
2058 printk(KERN_ERR "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n", 2190 if (netif_msg_intr(lp))
2059 dev->name, dirty_tx, lp->cur_tx, lp->tx_full); 2191 printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n",
2060 dirty_tx += lp->tx_ring_size; 2192 dev->name, lp->a.read_csr(ioaddr, 0));
2061 delta -= lp->tx_ring_size; 2193
2062 } 2194 spin_unlock(&lp->lock);
2063 2195
2064 if (lp->tx_full && 2196 return IRQ_HANDLED;
2065 netif_queue_stopped(dev) &&
2066 delta < lp->tx_ring_size - 2) {
2067 /* The ring is no longer full, clear tbusy. */
2068 lp->tx_full = 0;
2069 netif_wake_queue (dev);
2070 }
2071 lp->dirty_tx = dirty_tx;
2072 }
2073
2074 /* Log misc errors. */
2075 if (csr0 & 0x4000) lp->stats.tx_errors++; /* Tx babble. */
2076 if (csr0 & 0x1000) {
2077 /*
2078 * this happens when our receive ring is full. This shouldn't
2079 * be a problem as we will see normal rx interrupts for the frames
2080 * in the receive ring. But there are some PCI chipsets (I can
2081 * reproduce this on SP3G with Intel saturn chipset) which have
2082 * sometimes problems and will fill up the receive ring with
2083 * error descriptors. In this situation we don't get a rx
2084 * interrupt, but a missed frame interrupt sooner or later.
2085 * So we try to clean up our receive ring here.
2086 */
2087 pcnet32_rx(dev);
2088 lp->stats.rx_errors++; /* Missed a Rx frame. */
2089 }
2090 if (csr0 & 0x0800) {
2091 if (netif_msg_drv(lp))
2092 printk(KERN_ERR "%s: Bus master arbitration failure, status %4.4x.\n",
2093 dev->name, csr0);
2094 /* unlike for the lance, there is no restart needed */
2095 }
2096
2097 if (must_restart) {
2098 /* reset the chip to clear the error condition, then restart */
2099 lp->a.reset(ioaddr);
2100 lp->a.write_csr(ioaddr, 4, 0x0915);
2101 pcnet32_restart(dev, 0x0002);
2102 netif_wake_queue(dev);
2103 }
2104 }
2105
2106 /* Set interrupt enable. */
2107 lp->a.write_csr (ioaddr, 0, 0x0040);
2108 lp->a.write_rap (ioaddr,rap);
2109
2110 if (netif_msg_intr(lp))
2111 printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n",
2112 dev->name, lp->a.read_csr (ioaddr, 0));
2113
2114 spin_unlock(&lp->lock);
2115
2116 return IRQ_HANDLED;
2117} 2197}
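The Tx reclaim loop above trims dirty_tx with delta = (cur_tx - dirty_tx) & (tx_mod_mask + tx_ring_size). A minimal standalone sketch of that arithmetic follows, assuming a hypothetical 16-entry power-of-two ring; it is an illustration of the modular bookkeeping, not part of the patch.

        /*
         * Standalone sketch (not driver code): cur_tx and dirty_tx are
         * free-running counters, the ring size is a power of two, and
         * tx_mod_mask == tx_ring_size - 1.  Masking the difference with
         * (tx_mod_mask + tx_ring_size) keeps it modulo twice the ring size,
         * so a healthy ring yields delta <= tx_ring_size and anything larger
         * is flagged as an out-of-sync dirty pointer.
         */
        #include <assert.h>

        #define TX_RING_SIZE 16u                 /* hypothetical ring size */
        #define TX_MOD_MASK  (TX_RING_SIZE - 1)

        static unsigned int tx_delta(unsigned int cur_tx, unsigned int dirty_tx)
        {
                return (cur_tx - dirty_tx) & (TX_MOD_MASK + TX_RING_SIZE);
        }

        int main(void)
        {
                /* normal case: 5 descriptors outstanding, even across wrap */
                assert(tx_delta(21, 16) == 5);
                assert(tx_delta(3, 0xfffffffe) == 5);   /* counter wrapped */

                /* corrupted dirty pointer: delta exceeds the ring size */
                assert(tx_delta(40, 16) > TX_RING_SIZE);
                return 0;
        }

Keeping the counters free-running and masking only on use is what lets the out-of-sync test above (delta > tx_ring_size) tell a corrupted dirty pointer apart from ordinary wrap-around.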
2118 2198
2119static int 2199static int pcnet32_rx(struct net_device *dev)
2120pcnet32_rx(struct net_device *dev)
2121{ 2200{
2122 struct pcnet32_private *lp = dev->priv; 2201 struct pcnet32_private *lp = dev->priv;
2123 int entry = lp->cur_rx & lp->rx_mod_mask; 2202 int entry = lp->cur_rx & lp->rx_mod_mask;
2124 int boguscnt = lp->rx_ring_size / 2; 2203 int boguscnt = lp->rx_ring_size / 2;
2125 2204
2126 /* If we own the next entry, it's a new packet. Send it up. */ 2205 /* If we own the next entry, it's a new packet. Send it up. */
2127 while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) { 2206 while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) {
2128 int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8; 2207 int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8;
2129 2208
2130 if (status != 0x03) { /* There was an error. */ 2209 if (status != 0x03) { /* There was an error. */
2131 /* 2210 /*
2132 * There is a tricky error noted by John Murphy, 2211 * There is a tricky error noted by John Murphy,
2133 * <murf@perftech.com> to Russ Nelson: Even with full-sized 2212 * <murf@perftech.com> to Russ Nelson: Even with full-sized
2134 * buffers it's possible for a jabber packet to use two 2213 * buffers it's possible for a jabber packet to use two
2135 * buffers, with only the last correctly noting the error. 2214 * buffers, with only the last correctly noting the error.
2136 */ 2215 */
2137 if (status & 0x01) /* Only count a general error at the */ 2216 if (status & 0x01) /* Only count a general error at the */
2138 lp->stats.rx_errors++; /* end of a packet.*/ 2217 lp->stats.rx_errors++; /* end of a packet. */
2139 if (status & 0x20) lp->stats.rx_frame_errors++; 2218 if (status & 0x20)
2140 if (status & 0x10) lp->stats.rx_over_errors++; 2219 lp->stats.rx_frame_errors++;
2141 if (status & 0x08) lp->stats.rx_crc_errors++; 2220 if (status & 0x10)
2142 if (status & 0x04) lp->stats.rx_fifo_errors++; 2221 lp->stats.rx_over_errors++;
2143 lp->rx_ring[entry].status &= le16_to_cpu(0x03ff); 2222 if (status & 0x08)
2144 } else { 2223 lp->stats.rx_crc_errors++;
2145 /* Malloc up new buffer, compatible with net-2e. */ 2224 if (status & 0x04)
2146 short pkt_len = (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff)-4; 2225 lp->stats.rx_fifo_errors++;
2147 struct sk_buff *skb; 2226 lp->rx_ring[entry].status &= le16_to_cpu(0x03ff);
2148
2149 /* Discard oversize frames. */
2150 if (unlikely(pkt_len > PKT_BUF_SZ - 2)) {
2151 if (netif_msg_drv(lp))
2152 printk(KERN_ERR "%s: Impossible packet size %d!\n",
2153 dev->name, pkt_len);
2154 lp->stats.rx_errors++;
2155 } else if (pkt_len < 60) {
2156 if (netif_msg_rx_err(lp))
2157 printk(KERN_ERR "%s: Runt packet!\n", dev->name);
2158 lp->stats.rx_errors++;
2159 } else {
2160 int rx_in_place = 0;
2161
2162 if (pkt_len > rx_copybreak) {
2163 struct sk_buff *newskb;
2164
2165 if ((newskb = dev_alloc_skb(PKT_BUF_SZ))) {
2166 skb_reserve (newskb, 2);
2167 skb = lp->rx_skbuff[entry];
2168 pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[entry],
2169 PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE);
2170 skb_put (skb, pkt_len);
2171 lp->rx_skbuff[entry] = newskb;
2172 newskb->dev = dev;
2173 lp->rx_dma_addr[entry] =
2174 pci_map_single(lp->pci_dev, newskb->data,
2175 PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE);
2176 lp->rx_ring[entry].base = le32_to_cpu(lp->rx_dma_addr[entry]);
2177 rx_in_place = 1;
2178 } else
2179 skb = NULL;
2180 } else { 2227 } else {
2181 skb = dev_alloc_skb(pkt_len+2); 2228 /* Malloc up new buffer, compatible with net-2e. */
2182 } 2229 short pkt_len =
2183 2230 (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff)
2184 if (skb == NULL) { 2231 - 4;
2185 int i; 2232 struct sk_buff *skb;
2186 if (netif_msg_drv(lp)) 2233
2187 printk(KERN_ERR "%s: Memory squeeze, deferring packet.\n", 2234 /* Discard oversize frames. */
2188 dev->name); 2235 if (unlikely(pkt_len > PKT_BUF_SZ - 2)) {
2189 for (i = 0; i < lp->rx_ring_size; i++) 2236 if (netif_msg_drv(lp))
2190 if ((short)le16_to_cpu(lp->rx_ring[(entry+i) 2237 printk(KERN_ERR
2191 & lp->rx_mod_mask].status) < 0) 2238 "%s: Impossible packet size %d!\n",
2192 break; 2239 dev->name, pkt_len);
2193 2240 lp->stats.rx_errors++;
2194 if (i > lp->rx_ring_size -2) { 2241 } else if (pkt_len < 60) {
2195 lp->stats.rx_dropped++; 2242 if (netif_msg_rx_err(lp))
2196 lp->rx_ring[entry].status |= le16_to_cpu(0x8000); 2243 printk(KERN_ERR "%s: Runt packet!\n",
2197 wmb(); /* Make sure adapter sees owner change */ 2244 dev->name);
2198 lp->cur_rx++; 2245 lp->stats.rx_errors++;
2199 } 2246 } else {
2200 break; 2247 int rx_in_place = 0;
2201 } 2248
2202 skb->dev = dev; 2249 if (pkt_len > rx_copybreak) {
2203 if (!rx_in_place) { 2250 struct sk_buff *newskb;
2204 skb_reserve(skb,2); /* 16 byte align */ 2251
2205 skb_put(skb,pkt_len); /* Make room */ 2252 if ((newskb =
2206 pci_dma_sync_single_for_cpu(lp->pci_dev, 2253 dev_alloc_skb(PKT_BUF_SZ))) {
2207 lp->rx_dma_addr[entry], 2254 skb_reserve(newskb, 2);
2208 PKT_BUF_SZ-2, 2255 skb = lp->rx_skbuff[entry];
2209 PCI_DMA_FROMDEVICE); 2256 pci_unmap_single(lp->pci_dev,
2210 eth_copy_and_sum(skb, 2257 lp->
2211 (unsigned char *)(lp->rx_skbuff[entry]->data), 2258 rx_dma_addr
2212 pkt_len,0); 2259 [entry],
2213 pci_dma_sync_single_for_device(lp->pci_dev, 2260 PKT_BUF_SZ - 2,
2214 lp->rx_dma_addr[entry], 2261 PCI_DMA_FROMDEVICE);
2215 PKT_BUF_SZ-2, 2262 skb_put(skb, pkt_len);
2216 PCI_DMA_FROMDEVICE); 2263 lp->rx_skbuff[entry] = newskb;
2264 newskb->dev = dev;
2265 lp->rx_dma_addr[entry] =
2266 pci_map_single(lp->pci_dev,
2267 newskb->data,
2268 PKT_BUF_SZ -
2269 2,
2270 PCI_DMA_FROMDEVICE);
2271 lp->rx_ring[entry].base =
2272 le32_to_cpu(lp->
2273 rx_dma_addr
2274 [entry]);
2275 rx_in_place = 1;
2276 } else
2277 skb = NULL;
2278 } else {
2279 skb = dev_alloc_skb(pkt_len + 2);
2280 }
2281
2282 if (skb == NULL) {
2283 int i;
2284 if (netif_msg_drv(lp))
2285 printk(KERN_ERR
2286 "%s: Memory squeeze, deferring packet.\n",
2287 dev->name);
2288 for (i = 0; i < lp->rx_ring_size; i++)
2289 if ((short)
2290 le16_to_cpu(lp->
2291 rx_ring[(entry +
2292 i)
2293 & lp->
2294 rx_mod_mask].
2295 status) < 0)
2296 break;
2297
2298 if (i > lp->rx_ring_size - 2) {
2299 lp->stats.rx_dropped++;
2300 lp->rx_ring[entry].status |=
2301 le16_to_cpu(0x8000);
2302 wmb(); /* Make sure adapter sees owner change */
2303 lp->cur_rx++;
2304 }
2305 break;
2306 }
2307 skb->dev = dev;
2308 if (!rx_in_place) {
2309 skb_reserve(skb, 2); /* 16 byte align */
2310 skb_put(skb, pkt_len); /* Make room */
2311 pci_dma_sync_single_for_cpu(lp->pci_dev,
2312 lp->
2313 rx_dma_addr
2314 [entry],
2315 PKT_BUF_SZ -
2316 2,
2317 PCI_DMA_FROMDEVICE);
2318 eth_copy_and_sum(skb,
2319 (unsigned char *)(lp->
2320 rx_skbuff
2321 [entry]->
2322 data),
2323 pkt_len, 0);
2324 pci_dma_sync_single_for_device(lp->
2325 pci_dev,
2326 lp->
2327 rx_dma_addr
2328 [entry],
2329 PKT_BUF_SZ
2330 - 2,
2331 PCI_DMA_FROMDEVICE);
2332 }
2333 lp->stats.rx_bytes += skb->len;
2334 skb->protocol = eth_type_trans(skb, dev);
2335 netif_rx(skb);
2336 dev->last_rx = jiffies;
2337 lp->stats.rx_packets++;
2338 }
2217 } 2339 }
2218 lp->stats.rx_bytes += skb->len; 2340 /*
2219 skb->protocol=eth_type_trans(skb,dev); 2341 * The docs say that the buffer length isn't touched, but Andrew Boyd
2220 netif_rx(skb); 2342 * of QNX reports that some revs of the 79C965 clear it.
2221 dev->last_rx = jiffies; 2343 */
2222 lp->stats.rx_packets++; 2344 lp->rx_ring[entry].buf_length = le16_to_cpu(2 - PKT_BUF_SZ);
2223 } 2345 wmb(); /* Make sure owner changes after all others are visible */
2346 lp->rx_ring[entry].status |= le16_to_cpu(0x8000);
2347 entry = (++lp->cur_rx) & lp->rx_mod_mask;
2348 if (--boguscnt <= 0)
2349 break; /* don't stay in loop forever */
2224 } 2350 }
2225 /* 2351
2226 * The docs say that the buffer length isn't touched, but Andrew Boyd 2352 return 0;
2227 * of QNX reports that some revs of the 79C965 clear it.
2228 */
2229 lp->rx_ring[entry].buf_length = le16_to_cpu(2-PKT_BUF_SZ);
2230 wmb(); /* Make sure owner changes after all others are visible */
2231 lp->rx_ring[entry].status |= le16_to_cpu(0x8000);
2232 entry = (++lp->cur_rx) & lp->rx_mod_mask;
2233 if (--boguscnt <= 0) break; /* don't stay in loop forever */
2234 }
2235
2236 return 0;
2237} 2353}
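The receive path above applies the usual copybreak policy: frames larger than rx_copybreak keep their full-size DMA buffer and a fresh buffer is mapped into the ring, while smaller frames are copied into a right-sized skb so the ring buffer can be reused at once. A standalone sketch of just that decision follows; the numeric values are assumptions for illustration (the driver's real PKT_BUF_SZ and rx_copybreak defaults are defined earlier in the file).

        /* Illustration only, not driver code. */
        #include <stdio.h>

        #define PKT_BUF_SZ   1544       /* assumed ring buffer size */
        static int rx_copybreak = 200;  /* assumed module parameter default */

        static const char *rx_strategy(int pkt_len)
        {
                if (pkt_len > PKT_BUF_SZ - 2)
                        return "drop: impossible size";
                if (pkt_len < 60)
                        return "drop: runt";
                if (pkt_len > rx_copybreak)
                        return "swap buffers (hand full-size skb to the stack)";
                return "copy into small skb, recycle the ring buffer";
        }

        int main(void)
        {
                int sizes[] = { 42, 64, 1514, 4000 };
                for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                        printf("%4d bytes -> %s\n", sizes[i], rx_strategy(sizes[i]));
                return 0;
        }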
2238 2354
2239static int 2355static int pcnet32_close(struct net_device *dev)
2240pcnet32_close(struct net_device *dev)
2241{ 2356{
2242 unsigned long ioaddr = dev->base_addr; 2357 unsigned long ioaddr = dev->base_addr;
2243 struct pcnet32_private *lp = dev->priv; 2358 struct pcnet32_private *lp = dev->priv;
2244 int i; 2359 int i;
2245 unsigned long flags; 2360 unsigned long flags;
2246 2361
2247 del_timer_sync(&lp->watchdog_timer); 2362 del_timer_sync(&lp->watchdog_timer);
2248 2363
2249 netif_stop_queue(dev); 2364 netif_stop_queue(dev);
2250 2365
2251 spin_lock_irqsave(&lp->lock, flags); 2366 spin_lock_irqsave(&lp->lock, flags);
2252 2367
2253 lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); 2368 lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112);
2254 2369
2255 if (netif_msg_ifdown(lp)) 2370 if (netif_msg_ifdown(lp))
2256 printk(KERN_DEBUG "%s: Shutting down ethercard, status was %2.2x.\n", 2371 printk(KERN_DEBUG
2257 dev->name, lp->a.read_csr (ioaddr, 0)); 2372 "%s: Shutting down ethercard, status was %2.2x.\n",
2373 dev->name, lp->a.read_csr(ioaddr, 0));
2258 2374
2259 /* We stop the PCNET32 here -- it occasionally polls memory if we don't. */ 2375 /* We stop the PCNET32 here -- it occasionally polls memory if we don't. */
2260 lp->a.write_csr (ioaddr, 0, 0x0004); 2376 lp->a.write_csr(ioaddr, 0, 0x0004);
2261 2377
2262 /* 2378 /*
2263 * Switch back to 16bit mode to avoid problems with dumb 2379 * Switch back to 16bit mode to avoid problems with dumb
2264 * DOS packet driver after a warm reboot 2380 * DOS packet driver after a warm reboot
2265 */ 2381 */
2266 lp->a.write_bcr (ioaddr, 20, 4); 2382 lp->a.write_bcr(ioaddr, 20, 4);
2267 2383
2268 spin_unlock_irqrestore(&lp->lock, flags); 2384 spin_unlock_irqrestore(&lp->lock, flags);
2269 2385
2270 free_irq(dev->irq, dev); 2386 free_irq(dev->irq, dev);
2271 2387
2272 spin_lock_irqsave(&lp->lock, flags); 2388 spin_lock_irqsave(&lp->lock, flags);
2273 2389
2274 /* free all allocated skbuffs */ 2390 /* free all allocated skbuffs */
2275 for (i = 0; i < lp->rx_ring_size; i++) { 2391 for (i = 0; i < lp->rx_ring_size; i++) {
2276 lp->rx_ring[i].status = 0; 2392 lp->rx_ring[i].status = 0;
2277 wmb(); /* Make sure adapter sees owner change */ 2393 wmb(); /* Make sure adapter sees owner change */
2278 if (lp->rx_skbuff[i]) { 2394 if (lp->rx_skbuff[i]) {
2279 pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, 2395 pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i],
2280 PCI_DMA_FROMDEVICE); 2396 PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE);
2281 dev_kfree_skb(lp->rx_skbuff[i]); 2397 dev_kfree_skb(lp->rx_skbuff[i]);
2398 }
2399 lp->rx_skbuff[i] = NULL;
2400 lp->rx_dma_addr[i] = 0;
2282 } 2401 }
2283 lp->rx_skbuff[i] = NULL;
2284 lp->rx_dma_addr[i] = 0;
2285 }
2286 2402
2287 for (i = 0; i < lp->tx_ring_size; i++) { 2403 for (i = 0; i < lp->tx_ring_size; i++) {
2288 lp->tx_ring[i].status = 0; /* CPU owns buffer */ 2404 lp->tx_ring[i].status = 0; /* CPU owns buffer */
2289 wmb(); /* Make sure adapter sees owner change */ 2405 wmb(); /* Make sure adapter sees owner change */
2290 if (lp->tx_skbuff[i]) { 2406 if (lp->tx_skbuff[i]) {
2291 pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], 2407 pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i],
2292 lp->tx_skbuff[i]->len, PCI_DMA_TODEVICE); 2408 lp->tx_skbuff[i]->len,
2293 dev_kfree_skb(lp->tx_skbuff[i]); 2409 PCI_DMA_TODEVICE);
2410 dev_kfree_skb(lp->tx_skbuff[i]);
2411 }
2412 lp->tx_skbuff[i] = NULL;
2413 lp->tx_dma_addr[i] = 0;
2294 } 2414 }
2295 lp->tx_skbuff[i] = NULL;
2296 lp->tx_dma_addr[i] = 0;
2297 }
2298 2415
2299 spin_unlock_irqrestore(&lp->lock, flags); 2416 spin_unlock_irqrestore(&lp->lock, flags);
2300 2417
2301 return 0; 2418 return 0;
2302} 2419}
2303 2420
2304static struct net_device_stats * 2421static struct net_device_stats *pcnet32_get_stats(struct net_device *dev)
2305pcnet32_get_stats(struct net_device *dev)
2306{ 2422{
2307 struct pcnet32_private *lp = dev->priv; 2423 struct pcnet32_private *lp = dev->priv;
2308 unsigned long ioaddr = dev->base_addr; 2424 unsigned long ioaddr = dev->base_addr;
2309 u16 saved_addr; 2425 u16 saved_addr;
2310 unsigned long flags; 2426 unsigned long flags;
2311 2427
2312 spin_lock_irqsave(&lp->lock, flags); 2428 spin_lock_irqsave(&lp->lock, flags);
2313 saved_addr = lp->a.read_rap(ioaddr); 2429 saved_addr = lp->a.read_rap(ioaddr);
2314 lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); 2430 lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112);
2315 lp->a.write_rap(ioaddr, saved_addr); 2431 lp->a.write_rap(ioaddr, saved_addr);
2316 spin_unlock_irqrestore(&lp->lock, flags); 2432 spin_unlock_irqrestore(&lp->lock, flags);
2317 2433
2318 return &lp->stats; 2434 return &lp->stats;
2319} 2435}
2320 2436
2321/* taken from the sunlance driver, which it took from the depca driver */ 2437/* taken from the sunlance driver, which it took from the depca driver */
2322static void pcnet32_load_multicast (struct net_device *dev) 2438static void pcnet32_load_multicast(struct net_device *dev)
2323{ 2439{
2324 struct pcnet32_private *lp = dev->priv; 2440 struct pcnet32_private *lp = dev->priv;
2325 volatile struct pcnet32_init_block *ib = &lp->init_block; 2441 volatile struct pcnet32_init_block *ib = &lp->init_block;
2326 volatile u16 *mcast_table = (u16 *)&ib->filter; 2442 volatile u16 *mcast_table = (u16 *) & ib->filter;
2327 struct dev_mc_list *dmi=dev->mc_list; 2443 struct dev_mc_list *dmi = dev->mc_list;
2328 char *addrs; 2444 char *addrs;
2329 int i; 2445 int i;
2330 u32 crc; 2446 u32 crc;
2331 2447
2332 /* set all multicast bits */ 2448 /* set all multicast bits */
2333 if (dev->flags & IFF_ALLMULTI) { 2449 if (dev->flags & IFF_ALLMULTI) {
2334 ib->filter[0] = 0xffffffff; 2450 ib->filter[0] = 0xffffffff;
2335 ib->filter[1] = 0xffffffff; 2451 ib->filter[1] = 0xffffffff;
2452 return;
2453 }
2454 /* clear the multicast filter */
2455 ib->filter[0] = 0;
2456 ib->filter[1] = 0;
2457
2458 /* Add addresses */
2459 for (i = 0; i < dev->mc_count; i++) {
2460 addrs = dmi->dmi_addr;
2461 dmi = dmi->next;
2462
2463 /* multicast address? */
2464 if (!(*addrs & 1))
2465 continue;
2466
2467 crc = ether_crc_le(6, addrs);
2468 crc = crc >> 26;
2469 mcast_table[crc >> 4] =
2470 le16_to_cpu(le16_to_cpu(mcast_table[crc >> 4]) |
2471 (1 << (crc & 0xf)));
2472 }
2336 return; 2473 return;
2337 }
2338 /* clear the multicast filter */
2339 ib->filter[0] = 0;
2340 ib->filter[1] = 0;
2341
2342 /* Add addresses */
2343 for (i = 0; i < dev->mc_count; i++) {
2344 addrs = dmi->dmi_addr;
2345 dmi = dmi->next;
2346
2347 /* multicast address? */
2348 if (!(*addrs & 1))
2349 continue;
2350
2351 crc = ether_crc_le(6, addrs);
2352 crc = crc >> 26;
2353 mcast_table [crc >> 4] = le16_to_cpu(
2354 le16_to_cpu(mcast_table [crc >> 4]) | (1 << (crc & 0xf)));
2355 }
2356 return;
2357} 2474}
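pcnet32_load_multicast() above hashes each multicast address with ether_crc_le() and uses the top 6 bits of the CRC to pick one of the 64 logical-address-filter bits (word = hash >> 4, bit = hash & 0xf). The standalone sketch below reimplements the little-endian CRC-32 locally for illustration only; it is not driver code.

        #include <stdint.h>
        #include <stdio.h>

        /* Same bitwise little-endian CRC-32 the kernel helper computes
         * (initial value ~0, polynomial 0xedb88320, no final inversion). */
        static uint32_t ether_crc_le_sketch(int len, const uint8_t *data)
        {
                uint32_t crc = 0xffffffffu;

                while (len--) {
                        crc ^= *data++;
                        for (int bit = 0; bit < 8; bit++)
                                crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
                }
                return crc;
        }

        int main(void)
        {
                const uint8_t mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
                uint16_t filter[4] = { 0, 0, 0, 0 };

                uint32_t hash = ether_crc_le_sketch(6, mcast) >> 26;  /* top 6 bits */
                filter[hash >> 4] |= 1u << (hash & 0xf);              /* word, bit */

                printf("hash=%u -> filter word %u, bit %u\n",
                       (unsigned)hash, (unsigned)(hash >> 4), (unsigned)(hash & 0xf));
                return 0;
        }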
2358 2475
2359
2360/* 2476/*
2361 * Set or clear the multicast filter for this adaptor. 2477 * Set or clear the multicast filter for this adaptor.
2362 */ 2478 */
2363static void pcnet32_set_multicast_list(struct net_device *dev) 2479static void pcnet32_set_multicast_list(struct net_device *dev)
2364{ 2480{
2365 unsigned long ioaddr = dev->base_addr, flags; 2481 unsigned long ioaddr = dev->base_addr, flags;
2366 struct pcnet32_private *lp = dev->priv; 2482 struct pcnet32_private *lp = dev->priv;
2367 2483
2368 spin_lock_irqsave(&lp->lock, flags); 2484 spin_lock_irqsave(&lp->lock, flags);
2369 if (dev->flags&IFF_PROMISC) { 2485 if (dev->flags & IFF_PROMISC) {
2370 /* Log any net taps. */ 2486 /* Log any net taps. */
2371 if (netif_msg_hw(lp)) 2487 if (netif_msg_hw(lp))
2372 printk(KERN_INFO "%s: Promiscuous mode enabled.\n", dev->name); 2488 printk(KERN_INFO "%s: Promiscuous mode enabled.\n",
2373 lp->init_block.mode = le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) << 7); 2489 dev->name);
2374 } else { 2490 lp->init_block.mode =
2375 lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); 2491 le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) <<
2376 pcnet32_load_multicast (dev); 2492 7);
2377 } 2493 } else {
2378 2494 lp->init_block.mode =
2379 lp->a.write_csr (ioaddr, 0, 0x0004); /* Temporarily stop the lance. */ 2495 le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7);
2380 pcnet32_restart(dev, 0x0042); /* Resume normal operation */ 2496 pcnet32_load_multicast(dev);
2381 netif_wake_queue(dev); 2497 }
2382 2498
2383 spin_unlock_irqrestore(&lp->lock, flags); 2499 lp->a.write_csr(ioaddr, 0, 0x0004); /* Temporarily stop the lance. */
2500 pcnet32_restart(dev, 0x0042); /* Resume normal operation */
2501 netif_wake_queue(dev);
2502
2503 spin_unlock_irqrestore(&lp->lock, flags);
2384} 2504}
2385 2505
2386/* This routine assumes that the lp->lock is held */ 2506/* This routine assumes that the lp->lock is held */
2387static int mdio_read(struct net_device *dev, int phy_id, int reg_num) 2507static int mdio_read(struct net_device *dev, int phy_id, int reg_num)
2388{ 2508{
2389 struct pcnet32_private *lp = dev->priv; 2509 struct pcnet32_private *lp = dev->priv;
2390 unsigned long ioaddr = dev->base_addr; 2510 unsigned long ioaddr = dev->base_addr;
2391 u16 val_out; 2511 u16 val_out;
2392 2512
2393 if (!lp->mii) 2513 if (!lp->mii)
2394 return 0; 2514 return 0;
2395 2515
2396 lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); 2516 lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f));
2397 val_out = lp->a.read_bcr(ioaddr, 34); 2517 val_out = lp->a.read_bcr(ioaddr, 34);
2398 2518
2399 return val_out; 2519 return val_out;
2400} 2520}
2401 2521
2402/* This routine assumes that the lp->lock is held */ 2522/* This routine assumes that the lp->lock is held */
2403static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val) 2523static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val)
2404{ 2524{
2405 struct pcnet32_private *lp = dev->priv; 2525 struct pcnet32_private *lp = dev->priv;
2406 unsigned long ioaddr = dev->base_addr; 2526 unsigned long ioaddr = dev->base_addr;
2407 2527
2408 if (!lp->mii) 2528 if (!lp->mii)
2409 return; 2529 return;
2410 2530
2411 lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); 2531 lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f));
2412 lp->a.write_bcr(ioaddr, 34, val); 2532 lp->a.write_bcr(ioaddr, 34, val);
2413} 2533}
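mdio_read() and mdio_write() above give register-level access to the PHY through BCR33/BCR34. As a sketch only (not in the patch), a caller that already holds lp->lock could combine them for a read-modify-write of the PHY control register like this, using the MII_BMCR/BMCR_* constants from <linux/mii.h>; the helper name is hypothetical.

        static void pcnet32_force_100_full_sketch(struct net_device *dev, int phy_id)
        {
                int bmcr = mdio_read(dev, phy_id, MII_BMCR);

                bmcr &= ~BMCR_ANENABLE;                 /* disable autonegotiation */
                bmcr |= BMCR_SPEED100 | BMCR_FULLDPLX;  /* force 100 Mb/s, full duplex */
                mdio_write(dev, phy_id, MII_BMCR, bmcr);
        }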
2414 2534
2415static int pcnet32_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) 2535static int pcnet32_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
2416{ 2536{
2417 struct pcnet32_private *lp = dev->priv; 2537 struct pcnet32_private *lp = dev->priv;
2418 int rc; 2538 int rc;
2419 unsigned long flags; 2539 unsigned long flags;
2540
2541 /* SIOC[GS]MIIxxx ioctls */
2542 if (lp->mii) {
2543 spin_lock_irqsave(&lp->lock, flags);
2544 rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL);
2545 spin_unlock_irqrestore(&lp->lock, flags);
2546 } else {
2547 rc = -EOPNOTSUPP;
2548 }
2549
2550 return rc;
2551}
2552
2553static int pcnet32_check_otherphy(struct net_device *dev)
2554{
2555 struct pcnet32_private *lp = dev->priv;
2556 struct mii_if_info mii = lp->mii_if;
2557 u16 bmcr;
2558 int i;
2420 2559
2421 /* SIOC[GS]MIIxxx ioctls */ 2560 for (i = 0; i < PCNET32_MAX_PHYS; i++) {
2422 if (lp->mii) { 2561 if (i == lp->mii_if.phy_id)
2423 spin_lock_irqsave(&lp->lock, flags); 2562 continue; /* skip active phy */
2424 rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL); 2563 if (lp->phymask & (1 << i)) {
2425 spin_unlock_irqrestore(&lp->lock, flags); 2564 mii.phy_id = i;
2426 } else { 2565 if (mii_link_ok(&mii)) {
2427 rc = -EOPNOTSUPP; 2566 /* found PHY with active link */
2428 } 2567 if (netif_msg_link(lp))
2568 printk(KERN_INFO
2569 "%s: Using PHY number %d.\n",
2570 dev->name, i);
2571
2572 /* isolate inactive phy */
2573 bmcr =
2574 mdio_read(dev, lp->mii_if.phy_id, MII_BMCR);
2575 mdio_write(dev, lp->mii_if.phy_id, MII_BMCR,
2576 bmcr | BMCR_ISOLATE);
2577
2578 /* de-isolate new phy */
2579 bmcr = mdio_read(dev, i, MII_BMCR);
2580 mdio_write(dev, i, MII_BMCR,
2581 bmcr & ~BMCR_ISOLATE);
2582
2583 /* set new phy address */
2584 lp->mii_if.phy_id = i;
2585 return 1;
2586 }
2587 }
2588 }
2589 return 0;
2590}
2591
2592/*
2593 * Show the status of the media. Similar to mii_check_media however it
2594 * correctly shows the link speed for all (tested) pcnet32 variants.
2595 * Devices with no mii just report link state without speed.
2596 *
2597 * Caller is assumed to hold and release the lp->lock.
2598 */
2429 2599
2430 return rc; 2600static void pcnet32_check_media(struct net_device *dev, int verbose)
2601{
2602 struct pcnet32_private *lp = dev->priv;
2603 int curr_link;
2604 int prev_link = netif_carrier_ok(dev) ? 1 : 0;
2605 u32 bcr9;
2606
2607 if (lp->mii) {
2608 curr_link = mii_link_ok(&lp->mii_if);
2609 } else {
2610 ulong ioaddr = dev->base_addr; /* card base I/O address */
2611 curr_link = (lp->a.read_bcr(ioaddr, 4) != 0xc0);
2612 }
2613 if (!curr_link) {
2614 if (prev_link || verbose) {
2615 netif_carrier_off(dev);
2616 if (netif_msg_link(lp))
2617 printk(KERN_INFO "%s: link down\n", dev->name);
2618 }
2619 if (lp->phycount > 1) {
2620 curr_link = pcnet32_check_otherphy(dev);
2621 prev_link = 0;
2622 }
2623 } else if (verbose || !prev_link) {
2624 netif_carrier_on(dev);
2625 if (lp->mii) {
2626 if (netif_msg_link(lp)) {
2627 struct ethtool_cmd ecmd;
2628 mii_ethtool_gset(&lp->mii_if, &ecmd);
2629 printk(KERN_INFO
2630 "%s: link up, %sMbps, %s-duplex\n",
2631 dev->name,
2632 (ecmd.speed == SPEED_100) ? "100" : "10",
2633 (ecmd.duplex ==
2634 DUPLEX_FULL) ? "full" : "half");
2635 }
2636 bcr9 = lp->a.read_bcr(dev->base_addr, 9);
2637 if ((bcr9 & (1 << 0)) != lp->mii_if.full_duplex) {
2638 if (lp->mii_if.full_duplex)
2639 bcr9 |= (1 << 0);
2640 else
2641 bcr9 &= ~(1 << 0);
2642 lp->a.write_bcr(dev->base_addr, 9, bcr9);
2643 }
2644 } else {
2645 if (netif_msg_link(lp))
2646 printk(KERN_INFO "%s: link up\n", dev->name);
2647 }
2648 }
2431} 2649}
2432 2650
2651/*
2652 * Check for loss of link and link establishment.
2653 * Can not use mii_check_media because it does nothing if mode is forced.
2654 */
2655
2433static void pcnet32_watchdog(struct net_device *dev) 2656static void pcnet32_watchdog(struct net_device *dev)
2434{ 2657{
2435 struct pcnet32_private *lp = dev->priv; 2658 struct pcnet32_private *lp = dev->priv;
2436 unsigned long flags; 2659 unsigned long flags;
2437 2660
2438 /* Print the link status if it has changed */ 2661 /* Print the link status if it has changed */
2439 if (lp->mii) {
2440 spin_lock_irqsave(&lp->lock, flags); 2662 spin_lock_irqsave(&lp->lock, flags);
2441 mii_check_media (&lp->mii_if, netif_msg_link(lp), 0); 2663 pcnet32_check_media(dev, 0);
2442 spin_unlock_irqrestore(&lp->lock, flags); 2664 spin_unlock_irqrestore(&lp->lock, flags);
2443 }
2444 2665
2445 mod_timer (&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); 2666 mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT);
2446} 2667}
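pcnet32_watchdog() above is driven by lp->watchdog_timer, which the driver arms elsewhere (at open time) and cancels in pcnet32_close(). A hedged sketch of how such a timer is typically armed with the 2.6-era timer API follows; the helper name, the 2*HZ interval and the function-pointer cast are illustrative assumptions, not taken from the patch.

        static void pcnet32_arm_watchdog_sketch(struct net_device *dev)
        {
                struct pcnet32_private *lp = dev->priv;

                init_timer(&lp->watchdog_timer);
                /* old timer API: an unsigned long cookie carries the context */
                lp->watchdog_timer.data = (unsigned long)dev;
                lp->watchdog_timer.function =
                        (void (*)(unsigned long))pcnet32_watchdog;
                mod_timer(&lp->watchdog_timer, jiffies + 2 * HZ);
        }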
2447 2668
2448static void __devexit pcnet32_remove_one(struct pci_dev *pdev) 2669static void __devexit pcnet32_remove_one(struct pci_dev *pdev)
2449{ 2670{
2450 struct net_device *dev = pci_get_drvdata(pdev); 2671 struct net_device *dev = pci_get_drvdata(pdev);
2451 2672
2452 if (dev) { 2673 if (dev) {
2453 struct pcnet32_private *lp = dev->priv; 2674 struct pcnet32_private *lp = dev->priv;
2454 2675
2455 unregister_netdev(dev); 2676 unregister_netdev(dev);
2456 pcnet32_free_ring(dev); 2677 pcnet32_free_ring(dev);
2457 release_region(dev->base_addr, PCNET32_TOTAL_SIZE); 2678 release_region(dev->base_addr, PCNET32_TOTAL_SIZE);
2458 pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); 2679 pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr);
2459 free_netdev(dev); 2680 free_netdev(dev);
2460 pci_disable_device(pdev); 2681 pci_disable_device(pdev);
2461 pci_set_drvdata(pdev, NULL); 2682 pci_set_drvdata(pdev, NULL);
2462 } 2683 }
2463} 2684}
2464 2685
2465static struct pci_driver pcnet32_driver = { 2686static struct pci_driver pcnet32_driver = {
2466 .name = DRV_NAME, 2687 .name = DRV_NAME,
2467 .probe = pcnet32_probe_pci, 2688 .probe = pcnet32_probe_pci,
2468 .remove = __devexit_p(pcnet32_remove_one), 2689 .remove = __devexit_p(pcnet32_remove_one),
2469 .id_table = pcnet32_pci_tbl, 2690 .id_table = pcnet32_pci_tbl,
2470}; 2691};
2471 2692
2472/* An additional parameter that may be passed in... */ 2693/* An additional parameter that may be passed in... */
@@ -2477,9 +2698,11 @@ static int pcnet32_have_pci;
2477module_param(debug, int, 0); 2698module_param(debug, int, 0);
2478MODULE_PARM_DESC(debug, DRV_NAME " debug level"); 2699MODULE_PARM_DESC(debug, DRV_NAME " debug level");
2479module_param(max_interrupt_work, int, 0); 2700module_param(max_interrupt_work, int, 0);
2480MODULE_PARM_DESC(max_interrupt_work, DRV_NAME " maximum events handled per interrupt"); 2701MODULE_PARM_DESC(max_interrupt_work,
2702 DRV_NAME " maximum events handled per interrupt");
2481module_param(rx_copybreak, int, 0); 2703module_param(rx_copybreak, int, 0);
2482MODULE_PARM_DESC(rx_copybreak, DRV_NAME " copy breakpoint for copy-only-tiny-frames"); 2704MODULE_PARM_DESC(rx_copybreak,
2705 DRV_NAME " copy breakpoint for copy-only-tiny-frames");
2483module_param(tx_start_pt, int, 0); 2706module_param(tx_start_pt, int, 0);
2484MODULE_PARM_DESC(tx_start_pt, DRV_NAME " transmit start point (0-3)"); 2707MODULE_PARM_DESC(tx_start_pt, DRV_NAME " transmit start point (0-3)");
2485module_param(pcnet32vlb, int, 0); 2708module_param(pcnet32vlb, int, 0);
@@ -2490,7 +2713,9 @@ module_param_array(full_duplex, int, NULL, 0);
2490MODULE_PARM_DESC(full_duplex, DRV_NAME " full duplex setting(s) (1)"); 2713MODULE_PARM_DESC(full_duplex, DRV_NAME " full duplex setting(s) (1)");
2491/* Module Parameter for HomePNA cards added by Patrick Simmons, 2004 */ 2714/* Module Parameter for HomePNA cards added by Patrick Simmons, 2004 */
2492module_param_array(homepna, int, NULL, 0); 2715module_param_array(homepna, int, NULL, 0);
2493MODULE_PARM_DESC(homepna, DRV_NAME " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet"); 2716MODULE_PARM_DESC(homepna,
2717 DRV_NAME
2718 " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet");
2494 2719
2495MODULE_AUTHOR("Thomas Bogendoerfer"); 2720MODULE_AUTHOR("Thomas Bogendoerfer");
2496MODULE_DESCRIPTION("Driver for PCnet32 and PCnetPCI based ethercards"); 2721MODULE_DESCRIPTION("Driver for PCnet32 and PCnetPCI based ethercards");
@@ -2500,44 +2725,44 @@ MODULE_LICENSE("GPL");
2500 2725
2501static int __init pcnet32_init_module(void) 2726static int __init pcnet32_init_module(void)
2502{ 2727{
2503 printk(KERN_INFO "%s", version); 2728 printk(KERN_INFO "%s", version);
2504 2729
2505 pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT); 2730 pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT);
2506 2731
2507 if ((tx_start_pt >= 0) && (tx_start_pt <= 3)) 2732 if ((tx_start_pt >= 0) && (tx_start_pt <= 3))
2508 tx_start = tx_start_pt; 2733 tx_start = tx_start_pt;
2509 2734
2510 /* find the PCI devices */ 2735 /* find the PCI devices */
2511 if (!pci_module_init(&pcnet32_driver)) 2736 if (!pci_module_init(&pcnet32_driver))
2512 pcnet32_have_pci = 1; 2737 pcnet32_have_pci = 1;
2513 2738
2514 /* should we find any remaining VLbus devices ? */ 2739 /* should we find any remaining VLbus devices ? */
2515 if (pcnet32vlb) 2740 if (pcnet32vlb)
2516 pcnet32_probe_vlbus(); 2741 pcnet32_probe_vlbus();
2517 2742
2518 if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE)) 2743 if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE))
2519 printk(KERN_INFO PFX "%d cards_found.\n", cards_found); 2744 printk(KERN_INFO PFX "%d cards_found.\n", cards_found);
2520 2745
2521 return (pcnet32_have_pci + cards_found) ? 0 : -ENODEV; 2746 return (pcnet32_have_pci + cards_found) ? 0 : -ENODEV;
2522} 2747}
2523 2748
2524static void __exit pcnet32_cleanup_module(void) 2749static void __exit pcnet32_cleanup_module(void)
2525{ 2750{
2526 struct net_device *next_dev; 2751 struct net_device *next_dev;
2527 2752
2528 while (pcnet32_dev) { 2753 while (pcnet32_dev) {
2529 struct pcnet32_private *lp = pcnet32_dev->priv; 2754 struct pcnet32_private *lp = pcnet32_dev->priv;
2530 next_dev = lp->next; 2755 next_dev = lp->next;
2531 unregister_netdev(pcnet32_dev); 2756 unregister_netdev(pcnet32_dev);
2532 pcnet32_free_ring(pcnet32_dev); 2757 pcnet32_free_ring(pcnet32_dev);
2533 release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE); 2758 release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE);
2534 pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); 2759 pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr);
2535 free_netdev(pcnet32_dev); 2760 free_netdev(pcnet32_dev);
2536 pcnet32_dev = next_dev; 2761 pcnet32_dev = next_dev;
2537 } 2762 }
2538 2763
2539 if (pcnet32_have_pci) 2764 if (pcnet32_have_pci)
2540 pci_unregister_driver(&pcnet32_driver); 2765 pci_unregister_driver(&pcnet32_driver);
2541} 2766}
2542 2767
2543module_init(pcnet32_init_module); 2768module_init(pcnet32_init_module);
diff --git a/drivers/net/skfp/fplustm.c b/drivers/net/skfp/fplustm.c
index a4b2b6975d6c..0784f558ca9a 100644
--- a/drivers/net/skfp/fplustm.c
+++ b/drivers/net/skfp/fplustm.c
@@ -549,12 +549,12 @@ void formac_tx_restart(struct s_smc *smc)
549static void enable_formac(struct s_smc *smc) 549static void enable_formac(struct s_smc *smc)
550{ 550{
551 /* set formac IMSK : 0 enables irq */ 551 /* set formac IMSK : 0 enables irq */
552 outpw(FM_A(FM_IMSK1U),~mac_imsk1u) ; 552 outpw(FM_A(FM_IMSK1U),(unsigned short)~mac_imsk1u);
553 outpw(FM_A(FM_IMSK1L),~mac_imsk1l) ; 553 outpw(FM_A(FM_IMSK1L),(unsigned short)~mac_imsk1l);
554 outpw(FM_A(FM_IMSK2U),~mac_imsk2u) ; 554 outpw(FM_A(FM_IMSK2U),(unsigned short)~mac_imsk2u);
555 outpw(FM_A(FM_IMSK2L),~mac_imsk2l) ; 555 outpw(FM_A(FM_IMSK2L),(unsigned short)~mac_imsk2l);
556 outpw(FM_A(FM_IMSK3U),~mac_imsk3u) ; 556 outpw(FM_A(FM_IMSK3U),(unsigned short)~mac_imsk3u);
557 outpw(FM_A(FM_IMSK3L),~mac_imsk3l) ; 557 outpw(FM_A(FM_IMSK3L),(unsigned short)~mac_imsk3l);
558} 558}
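The casts added above matter because '~' is applied after the usual integer promotions: complementing a 16-bit mask yields a 32-bit int with all upper bits set, and the explicit (unsigned short) cast documents the intended truncation back to the 16-bit register width. A standalone illustration with a made-up mask value:

        #include <stdio.h>

        int main(void)
        {
                unsigned short mask = 0x0041;   /* hypothetical IMSK bits */

                int promoted = ~mask;                              /* 0xffffffbe */
                unsigned short truncated = (unsigned short)~mask;  /* 0xffbe */

                printf("~mask promoted : 0x%08x\n", (unsigned int)promoted);
                printf("~mask truncated: 0x%04x\n", truncated);
                return 0;
        }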
559 559
560#if 0 /* Removed because the driver should use the ASICs TX complete IRQ. */ 560#if 0 /* Removed because the driver should use the ASICs TX complete IRQ. */
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 25e028b7ce48..4eda81d41b10 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -44,7 +44,7 @@
44#include "skge.h" 44#include "skge.h"
45 45
46#define DRV_NAME "skge" 46#define DRV_NAME "skge"
47#define DRV_VERSION "1.3" 47#define DRV_VERSION "1.4"
48#define PFX DRV_NAME " " 48#define PFX DRV_NAME " "
49 49
50#define DEFAULT_TX_RING_SIZE 128 50#define DEFAULT_TX_RING_SIZE 128
@@ -104,7 +104,6 @@ static const int txqaddr[] = { Q_XA1, Q_XA2 };
104static const int rxqaddr[] = { Q_R1, Q_R2 }; 104static const int rxqaddr[] = { Q_R1, Q_R2 };
105static const u32 rxirqmask[] = { IS_R1_F, IS_R2_F }; 105static const u32 rxirqmask[] = { IS_R1_F, IS_R2_F };
106static const u32 txirqmask[] = { IS_XA1_F, IS_XA2_F }; 106static const u32 txirqmask[] = { IS_XA1_F, IS_XA2_F };
107static const u32 portirqmask[] = { IS_PORT_1, IS_PORT_2 };
108 107
109static int skge_get_regs_len(struct net_device *dev) 108static int skge_get_regs_len(struct net_device *dev)
110{ 109{
@@ -728,19 +727,18 @@ static struct ethtool_ops skge_ethtool_ops = {
728 * Allocate ring elements and chain them together 727 * Allocate ring elements and chain them together
729 * One-to-one association of board descriptors with ring elements 728 * One-to-one association of board descriptors with ring elements
730 */ 729 */
731static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u64 base) 730static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u32 base)
732{ 731{
733 struct skge_tx_desc *d; 732 struct skge_tx_desc *d;
734 struct skge_element *e; 733 struct skge_element *e;
735 int i; 734 int i;
736 735
737 ring->start = kmalloc(sizeof(*e)*ring->count, GFP_KERNEL); 736 ring->start = kcalloc(sizeof(*e), ring->count, GFP_KERNEL);
738 if (!ring->start) 737 if (!ring->start)
739 return -ENOMEM; 738 return -ENOMEM;
740 739
741 for (i = 0, e = ring->start, d = vaddr; i < ring->count; i++, e++, d++) { 740 for (i = 0, e = ring->start, d = vaddr; i < ring->count; i++, e++, d++) {
742 e->desc = d; 741 e->desc = d;
743 e->skb = NULL;
744 if (i == ring->count - 1) { 742 if (i == ring->count - 1) {
745 e->next = ring->start; 743 e->next = ring->start;
746 d->next_offset = base; 744 d->next_offset = base;
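A note on the hunk above: kcalloc() returns zeroed memory, which is why the explicit e->skb = NULL assignment could be dropped from the element loop. Roughly, and only as a kernel-context sketch (kcalloc's documented parameter order is (count, size, flags); the byte count is the same either way since the two are multiplied), the new call behaves like:

        ring->start = kmalloc(ring->count * sizeof(*e), GFP_KERNEL);
        if (ring->start)
                memset(ring->start, 0, ring->count * sizeof(*e));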
@@ -2169,27 +2167,31 @@ static int skge_up(struct net_device *dev)
2169 if (!skge->mem) 2167 if (!skge->mem)
2170 return -ENOMEM; 2168 return -ENOMEM;
2171 2169
2170 BUG_ON(skge->dma & 7);
2171
2172 if ((u64)skge->dma >> 32 != ((u64) skge->dma + skge->mem_size) >> 32) {
2173 printk(KERN_ERR PFX "pci_alloc_consistent region crosses 4G boundary\n");
2174 err = -EINVAL;
2175 goto free_pci_mem;
2176 }
2177
2172 memset(skge->mem, 0, skge->mem_size); 2178 memset(skge->mem, 0, skge->mem_size);
2173 2179
2174 if ((err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma))) 2180 err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma);
2181 if (err)
2175 goto free_pci_mem; 2182 goto free_pci_mem;
2176 2183
2177 err = skge_rx_fill(skge); 2184 err = skge_rx_fill(skge);
2178 if (err) 2185 if (err)
2179 goto free_rx_ring; 2186 goto free_rx_ring;
2180 2187
2181 if ((err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size, 2188 err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size,
2182 skge->dma + rx_size))) 2189 skge->dma + rx_size);
2190 if (err)
2183 goto free_rx_ring; 2191 goto free_rx_ring;
2184 2192
2185 skge->tx_avail = skge->tx_ring.count - 1; 2193 skge->tx_avail = skge->tx_ring.count - 1;
2186 2194
2187 /* Enable IRQ from port */
2188 spin_lock_irq(&hw->hw_lock);
2189 hw->intr_mask |= portirqmask[port];
2190 skge_write32(hw, B0_IMSK, hw->intr_mask);
2191 spin_unlock_irq(&hw->hw_lock);
2192
2193 /* Initialize MAC */ 2195 /* Initialize MAC */
2194 spin_lock_bh(&hw->phy_lock); 2196 spin_lock_bh(&hw->phy_lock);
2195 if (hw->chip_id == CHIP_ID_GENESIS) 2197 if (hw->chip_id == CHIP_ID_GENESIS)
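The 4 GB check added to skge_up() above guards against the ring memory straddling a 4 GB boundary, which matters because the descriptor chain now carries only 32-bit next pointers (note skge_ring_alloc()'s base parameter becoming u32 earlier in this patch). A standalone sketch of the test:

        #include <stdint.h>
        #include <stdio.h>

        /* True if [dma, dma + size) spans a 4 GB boundary: compare the upper
         * 32 bits of the first and one-past-last address. */
        static int crosses_4g(uint64_t dma, uint64_t size)
        {
                return (dma >> 32) != ((dma + size) >> 32);
        }

        int main(void)
        {
                printf("%d\n", crosses_4g(0x00000000fff00000ull, 0x200000)); /* 1 */
                printf("%d\n", crosses_4g(0x0000000100000000ull, 0x200000)); /* 0 */
                return 0;
        }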
@@ -2246,11 +2248,6 @@ static int skge_down(struct net_device *dev)
2246 else 2248 else
2247 yukon_stop(skge); 2249 yukon_stop(skge);
2248 2250
2249 spin_lock_irq(&hw->hw_lock);
2250 hw->intr_mask &= ~portirqmask[skge->port];
2251 skge_write32(hw, B0_IMSK, hw->intr_mask);
2252 spin_unlock_irq(&hw->hw_lock);
2253
2254 /* Stop transmitter */ 2251 /* Stop transmitter */
2255 skge_write8(hw, Q_ADDR(txqaddr[port], Q_CSR), CSR_STOP); 2252 skge_write8(hw, Q_ADDR(txqaddr[port], Q_CSR), CSR_STOP);
2256 skge_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL), 2253 skge_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL),
@@ -2307,18 +2304,15 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev)
2307 int i; 2304 int i;
2308 u32 control, len; 2305 u32 control, len;
2309 u64 map; 2306 u64 map;
2310 unsigned long flags;
2311 2307
2312 skb = skb_padto(skb, ETH_ZLEN); 2308 skb = skb_padto(skb, ETH_ZLEN);
2313 if (!skb) 2309 if (!skb)
2314 return NETDEV_TX_OK; 2310 return NETDEV_TX_OK;
2315 2311
2316 local_irq_save(flags);
2317 if (!spin_trylock(&skge->tx_lock)) { 2312 if (!spin_trylock(&skge->tx_lock)) {
2318 /* Collision - tell upper layer to requeue */ 2313 /* Collision - tell upper layer to requeue */
2319 local_irq_restore(flags); 2314 return NETDEV_TX_LOCKED;
2320 return NETDEV_TX_LOCKED; 2315 }
2321 }
2322 2316
2323 if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) { 2317 if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) {
2324 if (!netif_queue_stopped(dev)) { 2318 if (!netif_queue_stopped(dev)) {
@@ -2327,7 +2321,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev)
2327 printk(KERN_WARNING PFX "%s: ring full when queue awake!\n", 2321 printk(KERN_WARNING PFX "%s: ring full when queue awake!\n",
2328 dev->name); 2322 dev->name);
2329 } 2323 }
2330 spin_unlock_irqrestore(&skge->tx_lock, flags); 2324 spin_unlock(&skge->tx_lock);
2331 return NETDEV_TX_BUSY; 2325 return NETDEV_TX_BUSY;
2332 } 2326 }
2333 2327
@@ -2402,8 +2396,10 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev)
2402 netif_stop_queue(dev); 2396 netif_stop_queue(dev);
2403 } 2397 }
2404 2398
2399 mmiowb();
2400 spin_unlock(&skge->tx_lock);
2401
2405 dev->trans_start = jiffies; 2402 dev->trans_start = jiffies;
2406 spin_unlock_irqrestore(&skge->tx_lock, flags);
2407 2403
2408 return NETDEV_TX_OK; 2404 return NETDEV_TX_OK;
2409} 2405}
@@ -2416,7 +2412,7 @@ static inline void skge_tx_free(struct skge_hw *hw, struct skge_element *e)
2416 pci_unmap_addr(e, mapaddr), 2412 pci_unmap_addr(e, mapaddr),
2417 pci_unmap_len(e, maplen), 2413 pci_unmap_len(e, maplen),
2418 PCI_DMA_TODEVICE); 2414 PCI_DMA_TODEVICE);
2419 dev_kfree_skb_any(e->skb); 2415 dev_kfree_skb(e->skb);
2420 e->skb = NULL; 2416 e->skb = NULL;
2421 } else { 2417 } else {
2422 pci_unmap_page(hw->pdev, 2418 pci_unmap_page(hw->pdev,
@@ -2430,15 +2426,14 @@ static void skge_tx_clean(struct skge_port *skge)
2430{ 2426{
2431 struct skge_ring *ring = &skge->tx_ring; 2427 struct skge_ring *ring = &skge->tx_ring;
2432 struct skge_element *e; 2428 struct skge_element *e;
2433 unsigned long flags;
2434 2429
2435 spin_lock_irqsave(&skge->tx_lock, flags); 2430 spin_lock_bh(&skge->tx_lock);
2436 for (e = ring->to_clean; e != ring->to_use; e = e->next) { 2431 for (e = ring->to_clean; e != ring->to_use; e = e->next) {
2437 ++skge->tx_avail; 2432 ++skge->tx_avail;
2438 skge_tx_free(skge->hw, e); 2433 skge_tx_free(skge->hw, e);
2439 } 2434 }
2440 ring->to_clean = e; 2435 ring->to_clean = e;
2441 spin_unlock_irqrestore(&skge->tx_lock, flags); 2436 spin_unlock_bh(&skge->tx_lock);
2442} 2437}
2443 2438
2444static void skge_tx_timeout(struct net_device *dev) 2439static void skge_tx_timeout(struct net_device *dev)
@@ -2663,6 +2658,37 @@ resubmit:
2663 return NULL; 2658 return NULL;
2664} 2659}
2665 2660
2661static void skge_tx_done(struct skge_port *skge)
2662{
2663 struct skge_ring *ring = &skge->tx_ring;
2664 struct skge_element *e;
2665
2666 spin_lock(&skge->tx_lock);
2667 for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) {
2668 struct skge_tx_desc *td = e->desc;
2669 u32 control;
2670
2671 rmb();
2672 control = td->control;
2673 if (control & BMU_OWN)
2674 break;
2675
2676 if (unlikely(netif_msg_tx_done(skge)))
2677 printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n",
2678 skge->netdev->name, e - ring->start, td->status);
2679
2680 skge_tx_free(skge->hw, e);
2681 e->skb = NULL;
2682 ++skge->tx_avail;
2683 }
2684 ring->to_clean = e;
2685 skge_write8(skge->hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F);
2686
2687 if (skge->tx_avail > MAX_SKB_FRAGS + 1)
2688 netif_wake_queue(skge->netdev);
2689
2690 spin_unlock(&skge->tx_lock);
2691}
2666 2692
2667static int skge_poll(struct net_device *dev, int *budget) 2693static int skge_poll(struct net_device *dev, int *budget)
2668{ 2694{
@@ -2670,8 +2696,10 @@ static int skge_poll(struct net_device *dev, int *budget)
2670 struct skge_hw *hw = skge->hw; 2696 struct skge_hw *hw = skge->hw;
2671 struct skge_ring *ring = &skge->rx_ring; 2697 struct skge_ring *ring = &skge->rx_ring;
2672 struct skge_element *e; 2698 struct skge_element *e;
2673 unsigned int to_do = min(dev->quota, *budget); 2699 int to_do = min(dev->quota, *budget);
2674 unsigned int work_done = 0; 2700 int work_done = 0;
2701
2702 skge_tx_done(skge);
2675 2703
2676 for (e = ring->to_clean; prefetch(e->next), work_done < to_do; e = e->next) { 2704 for (e = ring->to_clean; prefetch(e->next), work_done < to_do; e = e->next) {
2677 struct skge_rx_desc *rd = e->desc; 2705 struct skge_rx_desc *rd = e->desc;
@@ -2683,8 +2711,8 @@ static int skge_poll(struct net_device *dev, int *budget)
2683 if (control & BMU_OWN) 2711 if (control & BMU_OWN)
2684 break; 2712 break;
2685 2713
2686 skb = skge_rx_get(skge, e, control, rd->status, 2714 skb = skge_rx_get(skge, e, control, rd->status,
2687 le16_to_cpu(rd->csum2)); 2715 le16_to_cpu(rd->csum2));
2688 if (likely(skb)) { 2716 if (likely(skb)) {
2689 dev->last_rx = jiffies; 2717 dev->last_rx = jiffies;
2690 netif_receive_skb(skb); 2718 netif_receive_skb(skb);
@@ -2705,49 +2733,15 @@ static int skge_poll(struct net_device *dev, int *budget)
2705 if (work_done >= to_do) 2733 if (work_done >= to_do)
2706 return 1; /* not done */ 2734 return 1; /* not done */
2707 2735
2708 spin_lock_irq(&hw->hw_lock); 2736 netif_rx_complete(dev);
2709 __netif_rx_complete(dev); 2737 mmiowb();
2710 hw->intr_mask |= portirqmask[skge->port]; 2738
2739 hw->intr_mask |= skge->port == 0 ? (IS_R1_F|IS_XA1_F) : (IS_R2_F|IS_XA2_F);
2711 skge_write32(hw, B0_IMSK, hw->intr_mask); 2740 skge_write32(hw, B0_IMSK, hw->intr_mask);
2712 spin_unlock_irq(&hw->hw_lock);
2713 2741
2714 return 0; 2742 return 0;
2715} 2743}
2716 2744
2717static inline void skge_tx_intr(struct net_device *dev)
2718{
2719 struct skge_port *skge = netdev_priv(dev);
2720 struct skge_hw *hw = skge->hw;
2721 struct skge_ring *ring = &skge->tx_ring;
2722 struct skge_element *e;
2723
2724 spin_lock(&skge->tx_lock);
2725 for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) {
2726 struct skge_tx_desc *td = e->desc;
2727 u32 control;
2728
2729 rmb();
2730 control = td->control;
2731 if (control & BMU_OWN)
2732 break;
2733
2734 if (unlikely(netif_msg_tx_done(skge)))
2735 printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n",
2736 dev->name, e - ring->start, td->status);
2737
2738 skge_tx_free(hw, e);
2739 e->skb = NULL;
2740 ++skge->tx_avail;
2741 }
2742 ring->to_clean = e;
2743 skge_write8(hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F);
2744
2745 if (skge->tx_avail > MAX_SKB_FRAGS + 1)
2746 netif_wake_queue(dev);
2747
2748 spin_unlock(&skge->tx_lock);
2749}
2750
2751/* Parity errors seem to happen when Genesis is connected to a switch 2745/* Parity errors seem to happen when Genesis is connected to a switch
2752 * with no other ports present. Heartbeat error?? 2746 * with no other ports present. Heartbeat error??
2753 */ 2747 */
@@ -2770,17 +2764,6 @@ static void skge_mac_parity(struct skge_hw *hw, int port)
2770 ? GMF_CLI_TX_FC : GMF_CLI_TX_PE); 2764 ? GMF_CLI_TX_FC : GMF_CLI_TX_PE);
2771} 2765}
2772 2766
2773static void skge_pci_clear(struct skge_hw *hw)
2774{
2775 u16 status;
2776
2777 pci_read_config_word(hw->pdev, PCI_STATUS, &status);
2778 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
2779 pci_write_config_word(hw->pdev, PCI_STATUS,
2780 status | PCI_STATUS_ERROR_BITS);
2781 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
2782}
2783
2784static void skge_mac_intr(struct skge_hw *hw, int port) 2767static void skge_mac_intr(struct skge_hw *hw, int port)
2785{ 2768{
2786 if (hw->chip_id == CHIP_ID_GENESIS) 2769 if (hw->chip_id == CHIP_ID_GENESIS)
@@ -2822,23 +2805,39 @@ static void skge_error_irq(struct skge_hw *hw)
2822 if (hwstatus & IS_M2_PAR_ERR) 2805 if (hwstatus & IS_M2_PAR_ERR)
2823 skge_mac_parity(hw, 1); 2806 skge_mac_parity(hw, 1);
2824 2807
2825 if (hwstatus & IS_R1_PAR_ERR) 2808 if (hwstatus & IS_R1_PAR_ERR) {
2809 printk(KERN_ERR PFX "%s: receive queue parity error\n",
2810 hw->dev[0]->name);
2826 skge_write32(hw, B0_R1_CSR, CSR_IRQ_CL_P); 2811 skge_write32(hw, B0_R1_CSR, CSR_IRQ_CL_P);
2812 }
2827 2813
2828 if (hwstatus & IS_R2_PAR_ERR) 2814 if (hwstatus & IS_R2_PAR_ERR) {
2815 printk(KERN_ERR PFX "%s: receive queue parity error\n",
2816 hw->dev[1]->name);
2829 skge_write32(hw, B0_R2_CSR, CSR_IRQ_CL_P); 2817 skge_write32(hw, B0_R2_CSR, CSR_IRQ_CL_P);
2818 }
2830 2819
2831 if (hwstatus & (IS_IRQ_MST_ERR|IS_IRQ_STAT)) { 2820 if (hwstatus & (IS_IRQ_MST_ERR|IS_IRQ_STAT)) {
2832 printk(KERN_ERR PFX "hardware error detected (status 0x%x)\n", 2821 u16 pci_status, pci_cmd;
2833 hwstatus); 2822
2823 pci_read_config_word(hw->pdev, PCI_COMMAND, &pci_cmd);
2824 pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status);
2834 2825
2835 skge_pci_clear(hw); 2826 printk(KERN_ERR PFX "%s: PCI error cmd=%#x status=%#x\n",
2827 pci_name(hw->pdev), pci_cmd, pci_status);
2828
2829 /* Write the error bits back to clear them. */
2830 pci_status &= PCI_STATUS_ERROR_BITS;
2831 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
2832 pci_write_config_word(hw->pdev, PCI_COMMAND,
2833 pci_cmd | PCI_COMMAND_SERR | PCI_COMMAND_PARITY);
2834 pci_write_config_word(hw->pdev, PCI_STATUS, pci_status);
2835 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
2836 2836
2837 /* if error still set then just ignore it */ 2837 /* if error still set then just ignore it */
2838 hwstatus = skge_read32(hw, B0_HWE_ISRC); 2838 hwstatus = skge_read32(hw, B0_HWE_ISRC);
2839 if (hwstatus & IS_IRQ_STAT) { 2839 if (hwstatus & IS_IRQ_STAT) {
2840 pr_debug("IRQ status %x: still set ignoring hardware errors\n", 2840 printk(KERN_INFO PFX "unable to clear error (so ignoring them)\n");
2841 hwstatus);
2842 hw->intr_mask &= ~IS_HW_ERR; 2841 hw->intr_mask &= ~IS_HW_ERR;
2843 } 2842 }
2844 } 2843 }
@@ -2855,12 +2854,11 @@ static void skge_extirq(unsigned long data)
2855 int port; 2854 int port;
2856 2855
2857 spin_lock(&hw->phy_lock); 2856 spin_lock(&hw->phy_lock);
2858 for (port = 0; port < 2; port++) { 2857 for (port = 0; port < hw->ports; port++) {
2859 struct net_device *dev = hw->dev[port]; 2858 struct net_device *dev = hw->dev[port];
2859 struct skge_port *skge = netdev_priv(dev);
2860 2860
2861 if (dev && netif_running(dev)) { 2861 if (netif_running(dev)) {
2862 struct skge_port *skge = netdev_priv(dev);
2863
2864 if (hw->chip_id != CHIP_ID_GENESIS) 2862 if (hw->chip_id != CHIP_ID_GENESIS)
2865 yukon_phy_intr(skge); 2863 yukon_phy_intr(skge);
2866 else 2864 else
@@ -2869,38 +2867,39 @@ static void skge_extirq(unsigned long data)
2869 } 2867 }
2870 spin_unlock(&hw->phy_lock); 2868 spin_unlock(&hw->phy_lock);
2871 2869
2872 spin_lock_irq(&hw->hw_lock);
2873 hw->intr_mask |= IS_EXT_REG; 2870 hw->intr_mask |= IS_EXT_REG;
2874 skge_write32(hw, B0_IMSK, hw->intr_mask); 2871 skge_write32(hw, B0_IMSK, hw->intr_mask);
2875 spin_unlock_irq(&hw->hw_lock);
2876} 2872}
2877 2873
2878static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs) 2874static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs)
2879{ 2875{
2880 struct skge_hw *hw = dev_id; 2876 struct skge_hw *hw = dev_id;
2881 u32 status = skge_read32(hw, B0_SP_ISRC); 2877 u32 status;
2882 2878
2883 if (status == 0 || status == ~0) /* hotplug or shared irq */ 2879 /* Reading this register masks IRQ */
2880 status = skge_read32(hw, B0_SP_ISRC);
2881 if (status == 0)
2884 return IRQ_NONE; 2882 return IRQ_NONE;
2885 2883
2886 spin_lock(&hw->hw_lock); 2884 if (status & IS_EXT_REG) {
2887 if (status & IS_R1_F) { 2885 hw->intr_mask &= ~IS_EXT_REG;
2886 tasklet_schedule(&hw->ext_tasklet);
2887 }
2888
2889 if (status & (IS_R1_F|IS_XA1_F)) {
2888 skge_write8(hw, Q_ADDR(Q_R1, Q_CSR), CSR_IRQ_CL_F); 2890 skge_write8(hw, Q_ADDR(Q_R1, Q_CSR), CSR_IRQ_CL_F);
2889 hw->intr_mask &= ~IS_R1_F; 2891 hw->intr_mask &= ~(IS_R1_F|IS_XA1_F);
2890 netif_rx_schedule(hw->dev[0]); 2892 netif_rx_schedule(hw->dev[0]);
2891 } 2893 }
2892 2894
2893 if (status & IS_R2_F) { 2895 if (status & (IS_R2_F|IS_XA2_F)) {
2894 skge_write8(hw, Q_ADDR(Q_R2, Q_CSR), CSR_IRQ_CL_F); 2896 skge_write8(hw, Q_ADDR(Q_R2, Q_CSR), CSR_IRQ_CL_F);
2895 hw->intr_mask &= ~IS_R2_F; 2897 hw->intr_mask &= ~(IS_R2_F|IS_XA2_F);
2896 netif_rx_schedule(hw->dev[1]); 2898 netif_rx_schedule(hw->dev[1]);
2897 } 2899 }
2898 2900
2899 if (status & IS_XA1_F) 2901 if (likely((status & hw->intr_mask) == 0))
2900 skge_tx_intr(hw->dev[0]); 2902 return IRQ_HANDLED;
2901
2902 if (status & IS_XA2_F)
2903 skge_tx_intr(hw->dev[1]);
2904 2903
2905 if (status & IS_PA_TO_RX1) { 2904 if (status & IS_PA_TO_RX1) {
2906 struct skge_port *skge = netdev_priv(hw->dev[0]); 2905 struct skge_port *skge = netdev_priv(hw->dev[0]);
@@ -2929,13 +2928,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs)
2929 if (status & IS_HW_ERR) 2928 if (status & IS_HW_ERR)
2930 skge_error_irq(hw); 2929 skge_error_irq(hw);
2931 2930
2932 if (status & IS_EXT_REG) {
2933 hw->intr_mask &= ~IS_EXT_REG;
2934 tasklet_schedule(&hw->ext_tasklet);
2935 }
2936
2937 skge_write32(hw, B0_IMSK, hw->intr_mask); 2931 skge_write32(hw, B0_IMSK, hw->intr_mask);
2938 spin_unlock(&hw->hw_lock);
2939 2932
2940 return IRQ_HANDLED; 2933 return IRQ_HANDLED;
2941} 2934}
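Taken together, the skge_poll() and skge_intr() hunks above move both Rx and Tx completion into NAPI: the ISR masks a port's Rx/Tx sources and schedules polling, and the poll routine re-arms them once the rings are drained. A condensed sketch of that hand-off follows (illustrative pseudo-driver code with hypothetical helper names, not from the patch).

        static irqreturn_t isr_sketch(struct skge_hw *hw, int port)
        {
                u32 bits = port == 0 ? (IS_R1_F | IS_XA1_F) : (IS_R2_F | IS_XA2_F);

                hw->intr_mask &= ~bits;                 /* stop further events ... */
                skge_write32(hw, B0_IMSK, hw->intr_mask);
                netif_rx_schedule(hw->dev[port]);       /* ... and let poll run */
                return IRQ_HANDLED;
        }

        static void poll_done_sketch(struct skge_hw *hw, int port)
        {
                u32 bits = port == 0 ? (IS_R1_F | IS_XA1_F) : (IS_R2_F | IS_XA2_F);

                netif_rx_complete(hw->dev[port]);       /* leave polling mode */
                hw->intr_mask |= bits;                  /* re-arm the sources */
                skge_write32(hw, B0_IMSK, hw->intr_mask);
        }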
@@ -3010,7 +3003,7 @@ static const char *skge_board_name(const struct skge_hw *hw)
3010static int skge_reset(struct skge_hw *hw) 3003static int skge_reset(struct skge_hw *hw)
3011{ 3004{
3012 u32 reg; 3005 u32 reg;
3013 u16 ctst; 3006 u16 ctst, pci_status;
3014 u8 t8, mac_cfg, pmd_type, phy_type; 3007 u8 t8, mac_cfg, pmd_type, phy_type;
3015 int i; 3008 int i;
3016 3009
@@ -3021,8 +3014,13 @@ static int skge_reset(struct skge_hw *hw)
3021 skge_write8(hw, B0_CTST, CS_RST_CLR); 3014 skge_write8(hw, B0_CTST, CS_RST_CLR);
3022 3015
3023 /* clear PCI errors, if any */ 3016 /* clear PCI errors, if any */
3024 skge_pci_clear(hw); 3017 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
3018 skge_write8(hw, B2_TST_CTRL2, 0);
3025 3019
3020 pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status);
3021 pci_write_config_word(hw->pdev, PCI_STATUS,
3022 pci_status | PCI_STATUS_ERROR_BITS);
3023 skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
3026 skge_write8(hw, B0_CTST, CS_MRST_CLR); 3024 skge_write8(hw, B0_CTST, CS_MRST_CLR);
3027 3025
3028 /* restore CLK_RUN bits (for Yukon-Lite) */ 3026 /* restore CLK_RUN bits (for Yukon-Lite) */
@@ -3081,7 +3079,10 @@ static int skge_reset(struct skge_hw *hw)
3081 else 3079 else
3082 hw->ram_size = t8 * 4096; 3080 hw->ram_size = t8 * 4096;
3083 3081
3084 hw->intr_mask = IS_HW_ERR | IS_EXT_REG; 3082 hw->intr_mask = IS_HW_ERR | IS_EXT_REG | IS_PORT_1;
3083 if (hw->ports > 1)
3084 hw->intr_mask |= IS_PORT_2;
3085
3085 if (hw->chip_id == CHIP_ID_GENESIS) 3086 if (hw->chip_id == CHIP_ID_GENESIS)
3086 genesis_init(hw); 3087 genesis_init(hw);
3087 else { 3088 else {
@@ -3251,13 +3252,15 @@ static int __devinit skge_probe(struct pci_dev *pdev,
3251 struct skge_hw *hw; 3252 struct skge_hw *hw;
3252 int err, using_dac = 0; 3253 int err, using_dac = 0;
3253 3254
3254 if ((err = pci_enable_device(pdev))) { 3255 err = pci_enable_device(pdev);
3256 if (err) {
3255 printk(KERN_ERR PFX "%s cannot enable PCI device\n", 3257 printk(KERN_ERR PFX "%s cannot enable PCI device\n",
3256 pci_name(pdev)); 3258 pci_name(pdev));
3257 goto err_out; 3259 goto err_out;
3258 } 3260 }
3259 3261
3260 if ((err = pci_request_regions(pdev, DRV_NAME))) { 3262 err = pci_request_regions(pdev, DRV_NAME);
3263 if (err) {
3261 printk(KERN_ERR PFX "%s cannot obtain PCI resources\n", 3264 printk(KERN_ERR PFX "%s cannot obtain PCI resources\n",
3262 pci_name(pdev)); 3265 pci_name(pdev));
3263 goto err_out_disable_pdev; 3266 goto err_out_disable_pdev;
@@ -3265,22 +3268,18 @@ static int __devinit skge_probe(struct pci_dev *pdev,
3265 3268
3266 pci_set_master(pdev); 3269 pci_set_master(pdev);
3267 3270
3268 if (sizeof(dma_addr_t) > sizeof(u32) && 3271 if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
3269 !(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) {
3270 using_dac = 1; 3272 using_dac = 1;
3271 err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); 3273 err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
3272 if (err < 0) { 3274 } else if (!(err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) {
3273 printk(KERN_ERR PFX "%s unable to obtain 64 bit DMA " 3275 using_dac = 0;
3274 "for consistent allocations\n", pci_name(pdev)); 3276 err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
3275 goto err_out_free_regions; 3277 }
3276 } 3278
3277 } else { 3279 if (err) {
3278 err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); 3280 printk(KERN_ERR PFX "%s no usable DMA configuration\n",
3279 if (err) { 3281 pci_name(pdev));
3280 printk(KERN_ERR PFX "%s no usable DMA configuration\n", 3282 goto err_out_free_regions;
3281 pci_name(pdev));
3282 goto err_out_free_regions;
3283 }
3284 } 3283 }
3285 3284
3286#ifdef __BIG_ENDIAN 3285#ifdef __BIG_ENDIAN
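The probe hunk above simplifies DMA-mask selection to the common try-64-bit-then-fall-back-to-32-bit pattern, recording in using_dac whether high DMA can be advertised. A condensed sketch of that policy as a standalone helper (illustrative; the real code keeps it inline and the helper name is hypothetical):

        static int set_dma_masks_sketch(struct pci_dev *pdev, int *using_dac)
        {
                int err;

                if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
                        *using_dac = 1;
                        err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
                } else {
                        *using_dac = 0;
                        err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
                        if (!err)
                                err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
                }
                return err;     /* non-zero: no usable DMA configuration */
        }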
@@ -3304,7 +3303,6 @@ static int __devinit skge_probe(struct pci_dev *pdev,
3304 3303
3305 hw->pdev = pdev; 3304 hw->pdev = pdev;
3306 spin_lock_init(&hw->phy_lock); 3305 spin_lock_init(&hw->phy_lock);
3307 spin_lock_init(&hw->hw_lock);
3308 tasklet_init(&hw->ext_tasklet, skge_extirq, (unsigned long) hw); 3306 tasklet_init(&hw->ext_tasklet, skge_extirq, (unsigned long) hw);
3309 3307
3310 hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000); 3308 hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000);
@@ -3314,7 +3312,8 @@ static int __devinit skge_probe(struct pci_dev *pdev,
3314 goto err_out_free_hw; 3312 goto err_out_free_hw;
3315 } 3313 }
3316 3314
3317 if ((err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw))) { 3315 err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw);
3316 if (err) {
3318 printk(KERN_ERR PFX "%s: cannot assign irq %d\n", 3317 printk(KERN_ERR PFX "%s: cannot assign irq %d\n",
3319 pci_name(pdev), pdev->irq); 3318 pci_name(pdev), pdev->irq);
3320 goto err_out_iounmap; 3319 goto err_out_iounmap;
@@ -3332,7 +3331,8 @@ static int __devinit skge_probe(struct pci_dev *pdev,
3332 if ((dev = skge_devinit(hw, 0, using_dac)) == NULL) 3331 if ((dev = skge_devinit(hw, 0, using_dac)) == NULL)
3333 goto err_out_led_off; 3332 goto err_out_led_off;
3334 3333
3335 if ((err = register_netdev(dev))) { 3334 err = register_netdev(dev);
3335 if (err) {
3336 printk(KERN_ERR PFX "%s: cannot register net device\n", 3336 printk(KERN_ERR PFX "%s: cannot register net device\n",
3337 pci_name(pdev)); 3337 pci_name(pdev));
3338 goto err_out_free_netdev; 3338 goto err_out_free_netdev;
@@ -3387,7 +3387,6 @@ static void __devexit skge_remove(struct pci_dev *pdev)
3387 3387
3388 skge_write32(hw, B0_IMSK, 0); 3388 skge_write32(hw, B0_IMSK, 0);
3389 skge_write16(hw, B0_LED, LED_STAT_OFF); 3389 skge_write16(hw, B0_LED, LED_STAT_OFF);
3390 skge_pci_clear(hw);
3391 skge_write8(hw, B0_CTST, CS_RST_SET); 3390 skge_write8(hw, B0_CTST, CS_RST_SET);
3392 3391
3393 tasklet_kill(&hw->ext_tasklet); 3392 tasklet_kill(&hw->ext_tasklet);
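Most of the skge_probe() churn above is mechanical: the assignment-inside-if idiom is split into a plain assignment followed by a test. A small standalone sketch of the two styles, with do_setup() as a made-up stand-in for pci_enable_device() and friends:

/* Sketch: the error-check style change applied throughout skge_probe(). */
#include <stdio.h>

static int do_setup(void)		/* made-up stand-in, always fails here */
{
	return -1;
}

int main(void)
{
	int err;

	if ((err = do_setup()))		/* old style: assignment buried in the test */
		printf("old style saw err=%d\n", err);

	err = do_setup();		/* new style: assign, then test */
	if (err)
		printf("new style saw err=%d\n", err);

	return 0;
}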
diff --git a/drivers/net/skge.h b/drivers/net/skge.h
index 941f12a333b6..2efdacc290e5 100644
--- a/drivers/net/skge.h
+++ b/drivers/net/skge.h
@@ -2402,7 +2402,6 @@ struct skge_hw {
2402 2402
2403 struct tasklet_struct ext_tasklet; 2403 struct tasklet_struct ext_tasklet;
2404 spinlock_t phy_lock; 2404 spinlock_t phy_lock;
2405 spinlock_t hw_lock;
2406}; 2405};
2407 2406
2408enum { 2407enum {
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 73260364cba3..f08fe6c884b2 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -51,7 +51,7 @@
51#include "sky2.h" 51#include "sky2.h"
52 52
53#define DRV_NAME "sky2" 53#define DRV_NAME "sky2"
54#define DRV_VERSION "0.15" 54#define DRV_VERSION "1.1"
55#define PFX DRV_NAME " " 55#define PFX DRV_NAME " "
56 56
57/* 57/*
@@ -61,10 +61,6 @@
61 * a receive requires one (or two if using 64 bit dma). 61 * a receive requires one (or two if using 64 bit dma).
62 */ 62 */
63 63
64#define is_ec_a1(hw) \
65 unlikely((hw)->chip_id == CHIP_ID_YUKON_EC && \
66 (hw)->chip_rev == CHIP_REV_YU_EC_A1)
67
68#define RX_LE_SIZE 512 64#define RX_LE_SIZE 512
69#define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le)) 65#define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le))
70#define RX_MAX_PENDING (RX_LE_SIZE/2 - 2) 66#define RX_MAX_PENDING (RX_LE_SIZE/2 - 2)
@@ -96,6 +92,10 @@ static int copybreak __read_mostly = 256;
96module_param(copybreak, int, 0); 92module_param(copybreak, int, 0);
97MODULE_PARM_DESC(copybreak, "Receive copy threshold"); 93MODULE_PARM_DESC(copybreak, "Receive copy threshold");
98 94
95static int disable_msi = 0;
96module_param(disable_msi, int, 0);
97MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");
98
99static const struct pci_device_id sky2_id_table[] = { 99static const struct pci_device_id sky2_id_table[] = {
100 { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, 100 { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) },
101 { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, 101 { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) },
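The new disable_msi parameter is a runtime escape hatch: loading the driver with modprobe sky2 disable_msi=1 skips the MSI probe added further down and keeps the device on legacy INTx interrupts.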
@@ -504,9 +504,9 @@ static void sky2_phy_init(struct sky2_hw *hw, unsigned port)
504/* Force a renegotiation */ 504/* Force a renegotiation */
505static void sky2_phy_reinit(struct sky2_port *sky2) 505static void sky2_phy_reinit(struct sky2_port *sky2)
506{ 506{
507 down(&sky2->phy_sema); 507 spin_lock_bh(&sky2->phy_lock);
508 sky2_phy_init(sky2->hw, sky2->port); 508 sky2_phy_init(sky2->hw, sky2->port);
509 up(&sky2->phy_sema); 509 spin_unlock_bh(&sky2->phy_lock);
510} 510}
511 511
512static void sky2_mac_init(struct sky2_hw *hw, unsigned port) 512static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
@@ -571,9 +571,9 @@ static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
571 571
572 sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC)); 572 sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC));
573 573
574 down(&sky2->phy_sema); 574 spin_lock_bh(&sky2->phy_lock);
575 sky2_phy_init(hw, port); 575 sky2_phy_init(hw, port);
576 up(&sky2->phy_sema); 576 spin_unlock_bh(&sky2->phy_lock);
577 577
578 /* MIB clear */ 578 /* MIB clear */
579 reg = gma_read16(hw, port, GM_PHY_ADDR); 579 reg = gma_read16(hw, port, GM_PHY_ADDR);
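The phy_sema-to-phy_lock conversion above repeats throughout the file. A sketch of the pattern with a hypothetical helper; spin_lock_bh() is used on process-context paths because the PHY is now also touched from the NAPI poll (softirq) path:

/* Sketch only: semaphore -> spinlock conversion for PHY access. */
static void example_phy_poke(struct sky2_port *sky2)	/* hypothetical */
{
	spin_lock_bh(&sky2->phy_lock);		/* was: down(&sky2->phy_sema); */
	sky2_phy_init(sky2->hw, sky2->port);
	spin_unlock_bh(&sky2->phy_lock);	/* was: up(&sky2->phy_sema);   */
}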
@@ -725,37 +725,11 @@ static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2)
725 return le; 725 return le;
726} 726}
727 727
728/* 728/* Update chip's next pointer */
729 * This is a workaround code taken from SysKonnect sk98lin driver 729static inline void sky2_put_idx(struct sky2_hw *hw, unsigned q, u16 idx)
730 * to deal with chip bug on Yukon EC rev 0 in the wraparound case.
731 */
732static void sky2_put_idx(struct sky2_hw *hw, unsigned q,
733 u16 idx, u16 *last, u16 size)
734{ 730{
735 wmb(); 731 wmb();
736 if (is_ec_a1(hw) && idx < *last) { 732 sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx);
737 u16 hwget = sky2_read16(hw, Y2_QADDR(q, PREF_UNIT_GET_IDX));
738
739 if (hwget == 0) {
740 /* Start prefetching again */
741 sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 0xe0);
742 goto setnew;
743 }
744
745 if (hwget == size - 1) {
746 /* set watermark to one list element */
747 sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 8);
748
749 /* set put index to first list element */
750 sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), 0);
751 } else /* have hardware go to end of list */
752 sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX),
753 size - 1);
754 } else {
755setnew:
756 sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx);
757 }
758 *last = idx;
759 mmiowb(); 733 mmiowb();
760} 734}
761 735
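With the Yukon EC rev A1 workaround gone, sky2_put_idx() reduces to a barrier-bracketed index update. The ordering it relies on, restated as a commented sketch:

/* Sketch only: barrier ordering around the producer-index update. */
static inline void example_put_idx(struct sky2_hw *hw, unsigned q, u16 idx)
{
	wmb();		/* descriptor writes must be visible before the index */
	sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx);
	mmiowb();	/* keep the MMIO write ordered ahead of a later spin_unlock() */
}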
@@ -878,7 +852,7 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
878 if (!netif_running(dev)) 852 if (!netif_running(dev))
879 return -ENODEV; /* Phy still in reset */ 853 return -ENODEV; /* Phy still in reset */
880 854
881 switch(cmd) { 855 switch (cmd) {
882 case SIOCGMIIPHY: 856 case SIOCGMIIPHY:
883 data->phy_id = PHY_ADDR_MARV; 857 data->phy_id = PHY_ADDR_MARV;
884 858
@@ -886,9 +860,9 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
886 case SIOCGMIIREG: { 860 case SIOCGMIIREG: {
887 u16 val = 0; 861 u16 val = 0;
888 862
889 down(&sky2->phy_sema); 863 spin_lock_bh(&sky2->phy_lock);
890 err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val); 864 err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val);
891 up(&sky2->phy_sema); 865 spin_unlock_bh(&sky2->phy_lock);
892 866
893 data->val_out = val; 867 data->val_out = val;
894 break; 868 break;
@@ -898,10 +872,10 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
898 if (!capable(CAP_NET_ADMIN)) 872 if (!capable(CAP_NET_ADMIN))
899 return -EPERM; 873 return -EPERM;
900 874
901 down(&sky2->phy_sema); 875 spin_lock_bh(&sky2->phy_lock);
902 err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f, 876 err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f,
903 data->val_in); 877 data->val_in);
904 up(&sky2->phy_sema); 878 spin_unlock_bh(&sky2->phy_lock);
905 break; 879 break;
906 } 880 }
907 return err; 881 return err;
@@ -1001,7 +975,6 @@ static int sky2_rx_start(struct sky2_port *sky2)
1001 975
1002 /* Tell chip about available buffers */ 976 /* Tell chip about available buffers */
1003 sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put); 977 sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put);
1004 sky2->rx_last_put = sky2_read16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX));
1005 return 0; 978 return 0;
1006nomem: 979nomem:
1007 sky2_rx_clean(sky2); 980 sky2_rx_clean(sky2);
@@ -1014,7 +987,7 @@ static int sky2_up(struct net_device *dev)
1014 struct sky2_port *sky2 = netdev_priv(dev); 987 struct sky2_port *sky2 = netdev_priv(dev);
1015 struct sky2_hw *hw = sky2->hw; 988 struct sky2_hw *hw = sky2->hw;
1016 unsigned port = sky2->port; 989 unsigned port = sky2->port;
1017 u32 ramsize, rxspace; 990 u32 ramsize, rxspace, imask;
1018 int err = -ENOMEM; 991 int err = -ENOMEM;
1019 992
1020 if (netif_msg_ifup(sky2)) 993 if (netif_msg_ifup(sky2))
@@ -1079,10 +1052,10 @@ static int sky2_up(struct net_device *dev)
1079 goto err_out; 1052 goto err_out;
1080 1053
1081 /* Enable interrupts from phy/mac for port */ 1054 /* Enable interrupts from phy/mac for port */
1082 spin_lock_irq(&hw->hw_lock); 1055 imask = sky2_read32(hw, B0_IMSK);
1083 hw->intr_mask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; 1056 imask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2;
1084 sky2_write32(hw, B0_IMSK, hw->intr_mask); 1057 sky2_write32(hw, B0_IMSK, imask);
1085 spin_unlock_irq(&hw->hw_lock); 1058
1086 return 0; 1059 return 0;
1087 1060
1088err_out: 1061err_out:
@@ -1299,8 +1272,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev)
1299 netif_stop_queue(dev); 1272 netif_stop_queue(dev);
1300 } 1273 }
1301 1274
1302 sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod, 1275 sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod);
1303 &sky2->tx_last_put, TX_RING_SIZE);
1304 1276
1305out_unlock: 1277out_unlock:
1306 spin_unlock(&sky2->tx_lock); 1278 spin_unlock(&sky2->tx_lock);
@@ -1332,7 +1304,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
1332 struct tx_ring_info *re = sky2->tx_ring + put; 1304 struct tx_ring_info *re = sky2->tx_ring + put;
1333 struct sk_buff *skb = re->skb; 1305 struct sk_buff *skb = re->skb;
1334 1306
1335 nxt = re->idx; 1307 nxt = re->idx;
1336 BUG_ON(nxt >= TX_RING_SIZE); 1308 BUG_ON(nxt >= TX_RING_SIZE);
1337 prefetch(sky2->tx_ring + nxt); 1309 prefetch(sky2->tx_ring + nxt);
1338 1310
@@ -1348,7 +1320,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
1348 struct tx_ring_info *fre; 1320 struct tx_ring_info *fre;
1349 fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE; 1321 fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE;
1350 pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr), 1322 pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr),
1351 skb_shinfo(skb)->frags[i].size, 1323 skb_shinfo(skb)->frags[i].size,
1352 PCI_DMA_TODEVICE); 1324 PCI_DMA_TODEVICE);
1353 } 1325 }
1354 1326
@@ -1356,7 +1328,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
1356 } 1328 }
1357 1329
1358 sky2->tx_cons = put; 1330 sky2->tx_cons = put;
1359 if (netif_queue_stopped(dev) && tx_avail(sky2) > MAX_SKB_TX_LE) 1331 if (tx_avail(sky2) > MAX_SKB_TX_LE)
1360 netif_wake_queue(dev); 1332 netif_wake_queue(dev);
1361} 1333}
1362 1334
@@ -1375,6 +1347,7 @@ static int sky2_down(struct net_device *dev)
1375 struct sky2_hw *hw = sky2->hw; 1347 struct sky2_hw *hw = sky2->hw;
1376 unsigned port = sky2->port; 1348 unsigned port = sky2->port;
1377 u16 ctrl; 1349 u16 ctrl;
1350 u32 imask;
1378 1351
1379 /* Never really got started! */ 1352 /* Never really got started! */
1380 if (!sky2->tx_le) 1353 if (!sky2->tx_le)
@@ -1386,14 +1359,6 @@ static int sky2_down(struct net_device *dev)
1386 /* Stop more packets from being queued */ 1359 /* Stop more packets from being queued */
1387 netif_stop_queue(dev); 1360 netif_stop_queue(dev);
1388 1361
1389 /* Disable port IRQ */
1390 spin_lock_irq(&hw->hw_lock);
1391 hw->intr_mask &= ~((sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2);
1392 sky2_write32(hw, B0_IMSK, hw->intr_mask);
1393 spin_unlock_irq(&hw->hw_lock);
1394
1395 flush_scheduled_work();
1396
1397 sky2_phy_reset(hw, port); 1362 sky2_phy_reset(hw, port);
1398 1363
1399 /* Stop transmitter */ 1364 /* Stop transmitter */
@@ -1437,6 +1402,11 @@ static int sky2_down(struct net_device *dev)
1437 sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET); 1402 sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET);
1438 sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET); 1403 sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET);
1439 1404
1405 /* Disable port IRQ */
1406 imask = sky2_read32(hw, B0_IMSK);
 1407 	imask &= ~(sky2->port == 0 ? Y2_IS_PORT_1 : Y2_IS_PORT_2);
1408 sky2_write32(hw, B0_IMSK, imask);
1409
1440 /* turn off LED's */ 1410 /* turn off LED's */
1441 sky2_write16(hw, B0_Y2LED, LED_STAT_OFF); 1411 sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
1442 1412
@@ -1631,20 +1601,19 @@ static int sky2_autoneg_done(struct sky2_port *sky2, u16 aux)
1631 return 0; 1601 return 0;
1632} 1602}
1633 1603
1634/* 1604/* Interrupt from PHY */
1635 * Interrupt from PHY are handled outside of interrupt context 1605static void sky2_phy_intr(struct sky2_hw *hw, unsigned port)
1636 * because accessing phy registers requires spin wait which might
1637 * cause excess interrupt latency.
1638 */
1639static void sky2_phy_task(void *arg)
1640{ 1606{
1641 struct sky2_port *sky2 = arg; 1607 struct net_device *dev = hw->dev[port];
1642 struct sky2_hw *hw = sky2->hw; 1608 struct sky2_port *sky2 = netdev_priv(dev);
1643 u16 istatus, phystat; 1609 u16 istatus, phystat;
1644 1610
1645 down(&sky2->phy_sema); 1611 spin_lock(&sky2->phy_lock);
1646 istatus = gm_phy_read(hw, sky2->port, PHY_MARV_INT_STAT); 1612 istatus = gm_phy_read(hw, port, PHY_MARV_INT_STAT);
1647 phystat = gm_phy_read(hw, sky2->port, PHY_MARV_PHY_STAT); 1613 phystat = gm_phy_read(hw, port, PHY_MARV_PHY_STAT);
1614
1615 if (!netif_running(dev))
1616 goto out;
1648 1617
1649 if (netif_msg_intr(sky2)) 1618 if (netif_msg_intr(sky2))
1650 printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n", 1619 printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n",
@@ -1670,12 +1639,7 @@ static void sky2_phy_task(void *arg)
1670 sky2_link_down(sky2); 1639 sky2_link_down(sky2);
1671 } 1640 }
1672out: 1641out:
1673 up(&sky2->phy_sema); 1642 spin_unlock(&sky2->phy_lock);
1674
1675 spin_lock_irq(&hw->hw_lock);
1676 hw->intr_mask |= (sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2;
1677 sky2_write32(hw, B0_IMSK, hw->intr_mask);
1678 spin_unlock_irq(&hw->hw_lock);
1679} 1643}
1680 1644
1681 1645
@@ -1687,31 +1651,40 @@ static void sky2_tx_timeout(struct net_device *dev)
1687 struct sky2_port *sky2 = netdev_priv(dev); 1651 struct sky2_port *sky2 = netdev_priv(dev);
1688 struct sky2_hw *hw = sky2->hw; 1652 struct sky2_hw *hw = sky2->hw;
1689 unsigned txq = txqaddr[sky2->port]; 1653 unsigned txq = txqaddr[sky2->port];
1690 u16 ridx; 1654 u16 report, done;
1691
1692 /* Maybe we just missed an status interrupt */
1693 spin_lock(&sky2->tx_lock);
1694 ridx = sky2_read16(hw,
1695 sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX);
1696 sky2_tx_complete(sky2, ridx);
1697 spin_unlock(&sky2->tx_lock);
1698
1699 if (!netif_queue_stopped(dev)) {
1700 if (net_ratelimit())
1701 pr_info(PFX "transmit interrupt missed? recovered\n");
1702 return;
1703 }
1704 1655
1705 if (netif_msg_timer(sky2)) 1656 if (netif_msg_timer(sky2))
1706 printk(KERN_ERR PFX "%s: tx timeout\n", dev->name); 1657 printk(KERN_ERR PFX "%s: tx timeout\n", dev->name);
1707 1658
1708 sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP); 1659 report = sky2_read16(hw, sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX);
1709 sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET); 1660 done = sky2_read16(hw, Q_ADDR(txq, Q_DONE));
1710 1661
1711 sky2_tx_clean(sky2); 1662 printk(KERN_DEBUG PFX "%s: transmit ring %u .. %u report=%u done=%u\n",
1663 dev->name,
1664 sky2->tx_cons, sky2->tx_prod, report, done);
1712 1665
1713 sky2_qset(hw, txq); 1666 if (report != done) {
1714 sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1); 1667 printk(KERN_INFO PFX "status burst pending (irq moderation?)\n");
1668
1669 sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
1670 sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
1671 } else if (report != sky2->tx_cons) {
1672 printk(KERN_INFO PFX "status report lost?\n");
1673
1674 spin_lock_bh(&sky2->tx_lock);
1675 sky2_tx_complete(sky2, report);
1676 spin_unlock_bh(&sky2->tx_lock);
1677 } else {
1678 printk(KERN_INFO PFX "hardware hung? flushing\n");
1679
1680 sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP);
1681 sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
1682
1683 sky2_tx_clean(sky2);
1684
1685 sky2_qset(hw, txq);
1686 sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1);
1687 }
1715} 1688}
1716 1689
1717 1690
@@ -1730,6 +1703,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu)
1730 struct sky2_hw *hw = sky2->hw; 1703 struct sky2_hw *hw = sky2->hw;
1731 int err; 1704 int err;
1732 u16 ctl, mode; 1705 u16 ctl, mode;
1706 u32 imask;
1733 1707
1734 if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) 1708 if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
1735 return -EINVAL; 1709 return -EINVAL;
@@ -1742,12 +1716,15 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu)
1742 return 0; 1716 return 0;
1743 } 1717 }
1744 1718
1719 imask = sky2_read32(hw, B0_IMSK);
1745 sky2_write32(hw, B0_IMSK, 0); 1720 sky2_write32(hw, B0_IMSK, 0);
1746 1721
1747 dev->trans_start = jiffies; /* prevent tx timeout */ 1722 dev->trans_start = jiffies; /* prevent tx timeout */
1748 netif_stop_queue(dev); 1723 netif_stop_queue(dev);
1749 netif_poll_disable(hw->dev[0]); 1724 netif_poll_disable(hw->dev[0]);
1750 1725
1726 synchronize_irq(hw->pdev->irq);
1727
1751 ctl = gma_read16(hw, sky2->port, GM_GP_CTRL); 1728 ctl = gma_read16(hw, sky2->port, GM_GP_CTRL);
1752 gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA); 1729 gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA);
1753 sky2_rx_stop(sky2); 1730 sky2_rx_stop(sky2);
@@ -1766,7 +1743,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu)
1766 sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD); 1743 sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD);
1767 1744
1768 err = sky2_rx_start(sky2); 1745 err = sky2_rx_start(sky2);
1769 sky2_write32(hw, B0_IMSK, hw->intr_mask); 1746 sky2_write32(hw, B0_IMSK, imask);
1770 1747
1771 if (err) 1748 if (err)
1772 dev_close(dev); 1749 dev_close(dev);
@@ -1843,8 +1820,7 @@ resubmit:
1843 sky2_rx_add(sky2, re->mapaddr); 1820 sky2_rx_add(sky2, re->mapaddr);
1844 1821
1845 /* Tell receiver about new buffers. */ 1822 /* Tell receiver about new buffers. */
1846 sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put, 1823 sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put);
1847 &sky2->rx_last_put, RX_LE_SIZE);
1848 1824
1849 return skb; 1825 return skb;
1850 1826
@@ -1871,76 +1847,51 @@ error:
1871 goto resubmit; 1847 goto resubmit;
1872} 1848}
1873 1849
1874/* 1850/* Transmit complete */
1875 * Check for transmit complete 1851static inline void sky2_tx_done(struct net_device *dev, u16 last)
1876 */
1877#define TX_NO_STATUS 0xffff
1878
1879static void sky2_tx_check(struct sky2_hw *hw, int port, u16 last)
1880{ 1852{
1881 if (last != TX_NO_STATUS) { 1853 struct sky2_port *sky2 = netdev_priv(dev);
1882 struct net_device *dev = hw->dev[port];
1883 if (dev && netif_running(dev)) {
1884 struct sky2_port *sky2 = netdev_priv(dev);
1885 1854
1886 spin_lock(&sky2->tx_lock); 1855 if (netif_running(dev)) {
1887 sky2_tx_complete(sky2, last); 1856 spin_lock(&sky2->tx_lock);
1888 spin_unlock(&sky2->tx_lock); 1857 sky2_tx_complete(sky2, last);
1889 } 1858 spin_unlock(&sky2->tx_lock);
1890 } 1859 }
1891} 1860}
1892 1861
1893/* 1862/* Process status response ring */
1894 * Both ports share the same status interrupt, therefore there is only 1863static int sky2_status_intr(struct sky2_hw *hw, int to_do)
1895 * one poll routine.
1896 */
1897static int sky2_poll(struct net_device *dev0, int *budget)
1898{ 1864{
1899 struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; 1865 int work_done = 0;
1900 unsigned int to_do = min(dev0->quota, *budget);
1901 unsigned int work_done = 0;
1902 u16 hwidx;
1903 u16 tx_done[2] = { TX_NO_STATUS, TX_NO_STATUS };
1904
1905 sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
1906
1907 /*
1908 * Kick the STAT_LEV_TIMER_CTRL timer.
1909 * This fixes my hangs on Yukon-EC (0xb6) rev 1.
1910 * The if clause is there to start the timer only if it has been
1911 * configured correctly and not been disabled via ethtool.
1912 */
1913 if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_START) {
1914 sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP);
1915 sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
1916 }
1917 1866
1918 hwidx = sky2_read16(hw, STAT_PUT_IDX);
1919 BUG_ON(hwidx >= STATUS_RING_SIZE);
1920 rmb(); 1867 rmb();
1921 1868
1922 while (hwidx != hw->st_idx) { 1869 for(;;) {
1923 struct sky2_status_le *le = hw->st_le + hw->st_idx; 1870 struct sky2_status_le *le = hw->st_le + hw->st_idx;
1924 struct net_device *dev; 1871 struct net_device *dev;
1925 struct sky2_port *sky2; 1872 struct sky2_port *sky2;
1926 struct sk_buff *skb; 1873 struct sk_buff *skb;
1927 u32 status; 1874 u32 status;
1928 u16 length; 1875 u16 length;
1876 u8 link, opcode;
1877
1878 opcode = le->opcode;
1879 if (!opcode)
1880 break;
1881 opcode &= ~HW_OWNER;
1929 1882
1930 le = hw->st_le + hw->st_idx;
1931 hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE; 1883 hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE;
1932 prefetch(hw->st_le + hw->st_idx); 1884 le->opcode = 0;
1933 1885
1934 BUG_ON(le->link >= 2); 1886 link = le->link;
1935 dev = hw->dev[le->link]; 1887 BUG_ON(link >= 2);
1936 if (dev == NULL || !netif_running(dev)) 1888 dev = hw->dev[link];
1937 continue;
1938 1889
1939 sky2 = netdev_priv(dev); 1890 sky2 = netdev_priv(dev);
1940 status = le32_to_cpu(le->status); 1891 length = le->length;
1941 length = le16_to_cpu(le->length); 1892 status = le->status;
1942 1893
1943 switch (le->opcode & ~HW_OWNER) { 1894 switch (opcode) {
1944 case OP_RXSTAT: 1895 case OP_RXSTAT:
1945 skb = sky2_receive(sky2, length, status); 1896 skb = sky2_receive(sky2, length, status);
1946 if (!skb) 1897 if (!skb)
@@ -1980,42 +1931,23 @@ static int sky2_poll(struct net_device *dev0, int *budget)
1980 1931
1981 case OP_TXINDEXLE: 1932 case OP_TXINDEXLE:
1982 /* TX index reports status for both ports */ 1933 /* TX index reports status for both ports */
1983 tx_done[0] = status & 0xffff; 1934 sky2_tx_done(hw->dev[0], status & 0xffff);
1984 tx_done[1] = ((status >> 24) & 0xff) 1935 if (hw->dev[1])
1985 | (u16)(length & 0xf) << 8; 1936 sky2_tx_done(hw->dev[1],
1937 ((status >> 24) & 0xff)
1938 | (u16)(length & 0xf) << 8);
1986 break; 1939 break;
1987 1940
1988 default: 1941 default:
1989 if (net_ratelimit()) 1942 if (net_ratelimit())
1990 printk(KERN_WARNING PFX 1943 printk(KERN_WARNING PFX
1991 "unknown status opcode 0x%x\n", le->opcode); 1944 "unknown status opcode 0x%x\n", opcode);
1992 break; 1945 break;
1993 } 1946 }
1994 } 1947 }
1995 1948
1996exit_loop: 1949exit_loop:
1997 sky2_tx_check(hw, 0, tx_done[0]); 1950 return work_done;
1998 sky2_tx_check(hw, 1, tx_done[1]);
1999
2000 if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
2001 sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
2002 sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
2003 }
2004
2005 if (likely(work_done < to_do)) {
2006 spin_lock_irq(&hw->hw_lock);
2007 __netif_rx_complete(dev0);
2008
2009 hw->intr_mask |= Y2_IS_STAT_BMU;
2010 sky2_write32(hw, B0_IMSK, hw->intr_mask);
2011 spin_unlock_irq(&hw->hw_lock);
2012
2013 return 0;
2014 } else {
2015 *budget -= work_done;
2016 dev0->quota -= work_done;
2017 return 1;
2018 }
2019} 1951}
2020 1952
2021static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status) 1953static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status)
@@ -2134,57 +2066,97 @@ static void sky2_mac_intr(struct sky2_hw *hw, unsigned port)
2134 } 2066 }
2135} 2067}
2136 2068
2137static void sky2_phy_intr(struct sky2_hw *hw, unsigned port) 2069/* This should never happen; it is a fatal situation */
2070static void sky2_descriptor_error(struct sky2_hw *hw, unsigned port,
2071 const char *rxtx, u32 mask)
2138{ 2072{
2139 struct net_device *dev = hw->dev[port]; 2073 struct net_device *dev = hw->dev[port];
2140 struct sky2_port *sky2 = netdev_priv(dev); 2074 struct sky2_port *sky2 = netdev_priv(dev);
2075 u32 imask;
2076
2077 printk(KERN_ERR PFX "%s: %s descriptor error (hardware problem)\n",
2078 dev ? dev->name : "<not registered>", rxtx);
2141 2079
2142 hw->intr_mask &= ~(port == 0 ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2); 2080 imask = sky2_read32(hw, B0_IMSK);
2143 sky2_write32(hw, B0_IMSK, hw->intr_mask); 2081 imask &= ~mask;
2082 sky2_write32(hw, B0_IMSK, imask);
2144 2083
2145 schedule_work(&sky2->phy_task); 2084 if (dev) {
2085 spin_lock(&sky2->phy_lock);
2086 sky2_link_down(sky2);
2087 spin_unlock(&sky2->phy_lock);
2088 }
2146} 2089}
2147 2090
2148static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs) 2091static int sky2_poll(struct net_device *dev0, int *budget)
2149{ 2092{
2150 struct sky2_hw *hw = dev_id; 2093 struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw;
2151 struct net_device *dev0 = hw->dev[0]; 2094 int work_limit = min(dev0->quota, *budget);
2152 u32 status; 2095 int work_done = 0;
2096 u32 status = sky2_read32(hw, B0_Y2_SP_EISR);
2153 2097
2154 status = sky2_read32(hw, B0_Y2_SP_ISRC2); 2098 if (unlikely(status & ~Y2_IS_STAT_BMU)) {
2155 if (status == 0 || status == ~0) 2099 if (status & Y2_IS_HW_ERR)
2156 return IRQ_NONE; 2100 sky2_hw_intr(hw);
2157 2101
2158 spin_lock(&hw->hw_lock); 2102 if (status & Y2_IS_IRQ_PHY1)
2159 if (status & Y2_IS_HW_ERR) 2103 sky2_phy_intr(hw, 0);
2160 sky2_hw_intr(hw);
2161 2104
2162 /* Do NAPI for Rx and Tx status */ 2105 if (status & Y2_IS_IRQ_PHY2)
2163 if (status & Y2_IS_STAT_BMU) { 2106 sky2_phy_intr(hw, 1);
2164 hw->intr_mask &= ~Y2_IS_STAT_BMU;
2165 sky2_write32(hw, B0_IMSK, hw->intr_mask);
2166 2107
2167 if (likely(__netif_rx_schedule_prep(dev0))) { 2108 if (status & Y2_IS_IRQ_MAC1)
2168 prefetch(&hw->st_le[hw->st_idx]); 2109 sky2_mac_intr(hw, 0);
2169 __netif_rx_schedule(dev0); 2110
2170 } 2111 if (status & Y2_IS_IRQ_MAC2)
2112 sky2_mac_intr(hw, 1);
2113
2114 if (status & Y2_IS_CHK_RX1)
2115 sky2_descriptor_error(hw, 0, "receive", Y2_IS_CHK_RX1);
2116
2117 if (status & Y2_IS_CHK_RX2)
2118 sky2_descriptor_error(hw, 1, "receive", Y2_IS_CHK_RX2);
2119
2120 if (status & Y2_IS_CHK_TXA1)
2121 sky2_descriptor_error(hw, 0, "transmit", Y2_IS_CHK_TXA1);
2122
2123 if (status & Y2_IS_CHK_TXA2)
2124 sky2_descriptor_error(hw, 1, "transmit", Y2_IS_CHK_TXA2);
2171 } 2125 }
2172 2126
2173 if (status & Y2_IS_IRQ_PHY1) 2127 if (status & Y2_IS_STAT_BMU) {
2174 sky2_phy_intr(hw, 0); 2128 work_done = sky2_status_intr(hw, work_limit);
2129 *budget -= work_done;
2130 dev0->quota -= work_done;
2131
2132 if (work_done >= work_limit)
2133 return 1;
2175 2134
2176 if (status & Y2_IS_IRQ_PHY2) 2135 sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
2177 sky2_phy_intr(hw, 1); 2136 }
2178 2137
2179 if (status & Y2_IS_IRQ_MAC1) 2138 netif_rx_complete(dev0);
2180 sky2_mac_intr(hw, 0);
2181 2139
2182 if (status & Y2_IS_IRQ_MAC2) 2140 status = sky2_read32(hw, B0_Y2_SP_LISR);
2183 sky2_mac_intr(hw, 1); 2141 return 0;
2142}
2184 2143
2185 sky2_write32(hw, B0_Y2_SP_ICR, 2); 2144static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs)
2145{
2146 struct sky2_hw *hw = dev_id;
2147 struct net_device *dev0 = hw->dev[0];
2148 u32 status;
2186 2149
 2187 	spin_unlock(&hw->hw_lock); 2150 	/* Reading this register masks interrupts as a side effect */
2151 status = sky2_read32(hw, B0_Y2_SP_ISRC2);
2152 if (status == 0 || status == ~0)
2153 return IRQ_NONE;
2154
2155 prefetch(&hw->st_le[hw->st_idx]);
2156 if (likely(__netif_rx_schedule_prep(dev0)))
2157 __netif_rx_schedule(dev0);
2158 else
2159 printk(KERN_DEBUG PFX "irq race detected\n");
2188 2160
2189 return IRQ_HANDLED; 2161 return IRQ_HANDLED;
2190} 2162}
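The net effect of the two hunks above is the standard hard-IRQ/NAPI split: the interrupt handler only schedules the poll, and the poll routine dispatches every event source against its budget. A minimal sketch under the 2.6.16 NAPI API (dev->quota, *budget, __netif_rx_schedule, netif_rx_complete); process_events() is a made-up worker:

/* Sketch only: ISR defers everything to the poll routine. */
static irqreturn_t example_isr(int irq, void *dev_id, struct pt_regs *regs)
{
	struct net_device *dev0 = dev_id;	/* hypothetical: first port's netdev */

	if (likely(__netif_rx_schedule_prep(dev0)))
		__netif_rx_schedule(dev0);
	return IRQ_HANDLED;
}

static int example_poll(struct net_device *dev0, int *budget)
{
	int work_limit = min(dev0->quota, *budget);
	int work_done = process_events(work_limit);	/* made-up worker */

	*budget -= work_done;
	dev0->quota -= work_done;
	if (work_done >= work_limit)
		return 1;			/* more work: stay scheduled */

	netif_rx_complete(dev0);		/* done; re-arm the interrupt */
	return 0;
}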
@@ -2238,6 +2210,23 @@ static int sky2_reset(struct sky2_hw *hw)
2238 return -EOPNOTSUPP; 2210 return -EOPNOTSUPP;
2239 } 2211 }
2240 2212
2213 hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4;
2214
2215 /* This rev is really old, and requires untested workarounds */
2216 if (hw->chip_id == CHIP_ID_YUKON_EC && hw->chip_rev == CHIP_REV_YU_EC_A1) {
2217 printk(KERN_ERR PFX "%s: unsupported revision Yukon-%s (0x%x) rev %d\n",
2218 pci_name(hw->pdev), yukon2_name[hw->chip_id - CHIP_ID_YUKON_XL],
2219 hw->chip_id, hw->chip_rev);
2220 return -EOPNOTSUPP;
2221 }
2222
2223 /* This chip is new and not tested yet */
2224 if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
2225 pr_info(PFX "%s: is a version of Yukon 2 chipset that has not been tested yet.\n",
2226 pci_name(hw->pdev));
2227 pr_info("Please report success/failure to maintainer <shemminger@osdl.org>\n");
2228 }
2229
2241 /* disable ASF */ 2230 /* disable ASF */
2242 if (hw->chip_id <= CHIP_ID_YUKON_EC) { 2231 if (hw->chip_id <= CHIP_ID_YUKON_EC) {
2243 sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET); 2232 sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET);
@@ -2258,7 +2247,7 @@ static int sky2_reset(struct sky2_hw *hw)
2258 sky2_write8(hw, B0_CTST, CS_MRST_CLR); 2247 sky2_write8(hw, B0_CTST, CS_MRST_CLR);
2259 2248
2260 /* clear any PEX errors */ 2249 /* clear any PEX errors */
2261 if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) 2250 if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP))
2262 sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL); 2251 sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL);
2263 2252
2264 2253
@@ -2271,7 +2260,6 @@ static int sky2_reset(struct sky2_hw *hw)
2271 if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC)) 2260 if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC))
2272 ++hw->ports; 2261 ++hw->ports;
2273 } 2262 }
2274 hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4;
2275 2263
2276 sky2_set_power_state(hw, PCI_D0); 2264 sky2_set_power_state(hw, PCI_D0);
2277 2265
@@ -2337,30 +2325,18 @@ static int sky2_reset(struct sky2_hw *hw)
2337 /* Set the list last index */ 2325 /* Set the list last index */
2338 sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1); 2326 sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1);
2339 2327
2340 /* These status setup values are copied from SysKonnect's driver */ 2328 sky2_write16(hw, STAT_TX_IDX_TH, 10);
2341 if (is_ec_a1(hw)) { 2329 sky2_write8(hw, STAT_FIFO_WM, 16);
2342 /* WA for dev. #4.3 */
2343 sky2_write16(hw, STAT_TX_IDX_TH, 0xfff); /* Tx Threshold */
2344
2345 /* set Status-FIFO watermark */
2346 sky2_write8(hw, STAT_FIFO_WM, 0x21); /* WA for dev. #4.18 */
2347 2330
2348 /* set Status-FIFO ISR watermark */ 2331 /* set Status-FIFO ISR watermark */
2349 sky2_write8(hw, STAT_FIFO_ISR_WM, 0x07); /* WA for dev. #4.18 */ 2332 if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0)
2350 sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 10000)); 2333 sky2_write8(hw, STAT_FIFO_ISR_WM, 4);
2351 } else { 2334 else
2352 sky2_write16(hw, STAT_TX_IDX_TH, 10); 2335 sky2_write8(hw, STAT_FIFO_ISR_WM, 16);
2353 sky2_write8(hw, STAT_FIFO_WM, 16);
2354
2355 /* set Status-FIFO ISR watermark */
2356 if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0)
2357 sky2_write8(hw, STAT_FIFO_ISR_WM, 4);
2358 else
2359 sky2_write8(hw, STAT_FIFO_ISR_WM, 16);
2360 2336
2361 sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000)); 2337 sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000));
2362 sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 7)); 2338 sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 20));
2363 } 2339 sky2_write32(hw, STAT_LEV_TIMER_INI, sky2_us2clk(hw, 100));
2364 2340
2365 /* enable status unit */ 2341 /* enable status unit */
2366 sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON); 2342 sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON);
@@ -2743,7 +2719,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data)
2743 ms = data * 1000; 2719 ms = data * 1000;
2744 2720
2745 /* save initial values */ 2721 /* save initial values */
2746 down(&sky2->phy_sema); 2722 spin_lock_bh(&sky2->phy_lock);
2747 if (hw->chip_id == CHIP_ID_YUKON_XL) { 2723 if (hw->chip_id == CHIP_ID_YUKON_XL) {
2748 u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR); 2724 u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
2749 gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3); 2725 gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
@@ -2759,9 +2735,9 @@ static int sky2_phys_id(struct net_device *dev, u32 data)
2759 sky2_led(hw, port, onoff); 2735 sky2_led(hw, port, onoff);
2760 onoff = !onoff; 2736 onoff = !onoff;
2761 2737
2762 up(&sky2->phy_sema); 2738 spin_unlock_bh(&sky2->phy_lock);
2763 interrupted = msleep_interruptible(250); 2739 interrupted = msleep_interruptible(250);
2764 down(&sky2->phy_sema); 2740 spin_lock_bh(&sky2->phy_lock);
2765 2741
2766 ms -= 250; 2742 ms -= 250;
2767 } 2743 }
@@ -2776,7 +2752,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data)
2776 gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl); 2752 gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl);
2777 gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover); 2753 gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover);
2778 } 2754 }
2779 up(&sky2->phy_sema); 2755 spin_unlock_bh(&sky2->phy_lock);
2780 2756
2781 return 0; 2757 return 0;
2782} 2758}
@@ -2806,38 +2782,6 @@ static int sky2_set_pauseparam(struct net_device *dev,
2806 return err; 2782 return err;
2807} 2783}
2808 2784
2809#ifdef CONFIG_PM
2810static void sky2_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
2811{
2812 struct sky2_port *sky2 = netdev_priv(dev);
2813
2814 wol->supported = WAKE_MAGIC;
2815 wol->wolopts = sky2->wol ? WAKE_MAGIC : 0;
2816}
2817
2818static int sky2_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
2819{
2820 struct sky2_port *sky2 = netdev_priv(dev);
2821 struct sky2_hw *hw = sky2->hw;
2822
2823 if (wol->wolopts != WAKE_MAGIC && wol->wolopts != 0)
2824 return -EOPNOTSUPP;
2825
2826 sky2->wol = wol->wolopts == WAKE_MAGIC;
2827
2828 if (sky2->wol) {
2829 memcpy_toio(hw->regs + WOL_MAC_ADDR, dev->dev_addr, ETH_ALEN);
2830
2831 sky2_write16(hw, WOL_CTRL_STAT,
2832 WOL_CTL_ENA_PME_ON_MAGIC_PKT |
2833 WOL_CTL_ENA_MAGIC_PKT_UNIT);
2834 } else
2835 sky2_write16(hw, WOL_CTRL_STAT, WOL_CTL_DEFAULT);
2836
2837 return 0;
2838}
2839#endif
2840
2841static int sky2_get_coalesce(struct net_device *dev, 2785static int sky2_get_coalesce(struct net_device *dev,
2842 struct ethtool_coalesce *ecmd) 2786 struct ethtool_coalesce *ecmd)
2843{ 2787{
@@ -2878,19 +2822,11 @@ static int sky2_set_coalesce(struct net_device *dev,
2878{ 2822{
2879 struct sky2_port *sky2 = netdev_priv(dev); 2823 struct sky2_port *sky2 = netdev_priv(dev);
2880 struct sky2_hw *hw = sky2->hw; 2824 struct sky2_hw *hw = sky2->hw;
2881 const u32 tmin = sky2_clk2us(hw, 1); 2825 const u32 tmax = sky2_clk2us(hw, 0x0ffffff);
2882 const u32 tmax = 5000;
2883
2884 if (ecmd->tx_coalesce_usecs != 0 &&
2885 (ecmd->tx_coalesce_usecs < tmin || ecmd->tx_coalesce_usecs > tmax))
2886 return -EINVAL;
2887
2888 if (ecmd->rx_coalesce_usecs != 0 &&
2889 (ecmd->rx_coalesce_usecs < tmin || ecmd->rx_coalesce_usecs > tmax))
2890 return -EINVAL;
2891 2826
2892 if (ecmd->rx_coalesce_usecs_irq != 0 && 2827 if (ecmd->tx_coalesce_usecs > tmax ||
2893 (ecmd->rx_coalesce_usecs_irq < tmin || ecmd->rx_coalesce_usecs_irq > tmax)) 2828 ecmd->rx_coalesce_usecs > tmax ||
2829 ecmd->rx_coalesce_usecs_irq > tmax)
2894 return -EINVAL; 2830 return -EINVAL;
2895 2831
2896 if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1) 2832 if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1)
@@ -3025,10 +2961,6 @@ static struct ethtool_ops sky2_ethtool_ops = {
3025 .set_ringparam = sky2_set_ringparam, 2961 .set_ringparam = sky2_set_ringparam,
3026 .get_pauseparam = sky2_get_pauseparam, 2962 .get_pauseparam = sky2_get_pauseparam,
3027 .set_pauseparam = sky2_set_pauseparam, 2963 .set_pauseparam = sky2_set_pauseparam,
3028#ifdef CONFIG_PM
3029 .get_wol = sky2_get_wol,
3030 .set_wol = sky2_set_wol,
3031#endif
3032 .phys_id = sky2_phys_id, 2964 .phys_id = sky2_phys_id,
3033 .get_stats_count = sky2_get_stats_count, 2965 .get_stats_count = sky2_get_stats_count,
3034 .get_ethtool_stats = sky2_get_ethtool_stats, 2966 .get_ethtool_stats = sky2_get_ethtool_stats,
@@ -3082,16 +3014,15 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw,
3082 sky2->speed = -1; 3014 sky2->speed = -1;
3083 sky2->advertising = sky2_supported_modes(hw); 3015 sky2->advertising = sky2_supported_modes(hw);
3084 3016
3085 /* Receive checksum disabled for Yukon XL 3017 /* Receive checksum disabled for Yukon XL
3086 * because of observed problems with incorrect 3018 * because of observed problems with incorrect
3087 * values when multiple packets are received in one interrupt 3019 * values when multiple packets are received in one interrupt
3088 */ 3020 */
3089 sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL); 3021 sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL);
3090 3022
3091 INIT_WORK(&sky2->phy_task, sky2_phy_task, sky2); 3023 spin_lock_init(&sky2->phy_lock);
3092 init_MUTEX(&sky2->phy_sema);
3093 sky2->tx_pending = TX_DEF_PENDING; 3024 sky2->tx_pending = TX_DEF_PENDING;
3094 sky2->rx_pending = is_ec_a1(hw) ? 8 : RX_DEF_PENDING; 3025 sky2->rx_pending = RX_DEF_PENDING;
3095 sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN); 3026 sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN);
3096 3027
3097 hw->dev[port] = dev; 3028 hw->dev[port] = dev;
@@ -3133,6 +3064,66 @@ static void __devinit sky2_show_addr(struct net_device *dev)
3133 dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]); 3064 dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]);
3134} 3065}
3135 3066
3067/* Handle software interrupt used during MSI test */
3068static irqreturn_t __devinit sky2_test_intr(int irq, void *dev_id,
3069 struct pt_regs *regs)
3070{
3071 struct sky2_hw *hw = dev_id;
3072 u32 status = sky2_read32(hw, B0_Y2_SP_ISRC2);
3073
3074 if (status == 0)
3075 return IRQ_NONE;
3076
3077 if (status & Y2_IS_IRQ_SW) {
3078 hw->msi_detected = 1;
3079 wake_up(&hw->msi_wait);
3080 sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ);
3081 }
3082 sky2_write32(hw, B0_Y2_SP_ICR, 2);
3083
3084 return IRQ_HANDLED;
3085}
3086
3087/* Test interrupt path by forcing a software IRQ */
3088static int __devinit sky2_test_msi(struct sky2_hw *hw)
3089{
3090 struct pci_dev *pdev = hw->pdev;
3091 int err;
3092
3093 sky2_write32(hw, B0_IMSK, Y2_IS_IRQ_SW);
3094
3095 err = request_irq(pdev->irq, sky2_test_intr, SA_SHIRQ, DRV_NAME, hw);
3096 if (err) {
3097 printk(KERN_ERR PFX "%s: cannot assign irq %d\n",
3098 pci_name(pdev), pdev->irq);
3099 return err;
3100 }
3101
3102 init_waitqueue_head (&hw->msi_wait);
3103
3104 sky2_write8(hw, B0_CTST, CS_ST_SW_IRQ);
3105 wmb();
3106
3107 wait_event_timeout(hw->msi_wait, hw->msi_detected, HZ/10);
3108
3109 if (!hw->msi_detected) {
3110 /* MSI test failed, go back to INTx mode */
3111 printk(KERN_WARNING PFX "%s: No interrupt was generated using MSI, "
3112 "switching to INTx mode. Please report this failure to "
3113 "the PCI maintainer and include system chipset information.\n",
3114 pci_name(pdev));
3115
3116 err = -EOPNOTSUPP;
3117 sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ);
3118 }
3119
3120 sky2_write32(hw, B0_IMSK, 0);
3121
3122 free_irq(pdev->irq, hw);
3123
3124 return err;
3125}
3126
3136static int __devinit sky2_probe(struct pci_dev *pdev, 3127static int __devinit sky2_probe(struct pci_dev *pdev,
3137 const struct pci_device_id *ent) 3128 const struct pci_device_id *ent)
3138{ 3129{
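sky2_test_msi() above boils down to a fire-and-wait handshake: unmask only the software IRQ, trigger it, and wait briefly for the test ISR to flag delivery. The same handshake condensed into a sketch (request_irq/free_irq handling omitted):

/* Sketch only: the software-IRQ handshake behind the MSI self-test. */
static int example_irq_selftest(struct sky2_hw *hw)
{
	hw->msi_detected = 0;
	init_waitqueue_head(&hw->msi_wait);

	sky2_write32(hw, B0_IMSK, Y2_IS_IRQ_SW);	/* unmask only the SW IRQ */
	sky2_write8(hw, B0_CTST, CS_ST_SW_IRQ);		/* fire it                */

	/* sky2_test_intr() sets msi_detected and wakes msi_wait */
	wait_event_timeout(hw->msi_wait, hw->msi_detected, HZ / 10);

	sky2_write32(hw, B0_IMSK, 0);
	return hw->msi_detected ? 0 : -EOPNOTSUPP;	/* -EOPNOTSUPP: fall back to INTx */
}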
@@ -3201,7 +3192,6 @@ static int __devinit sky2_probe(struct pci_dev *pdev,
3201 goto err_out_free_hw; 3192 goto err_out_free_hw;
3202 } 3193 }
3203 hw->pm_cap = pm_cap; 3194 hw->pm_cap = pm_cap;
3204 spin_lock_init(&hw->hw_lock);
3205 3195
3206#ifdef __BIG_ENDIAN 3196#ifdef __BIG_ENDIAN
3207 /* byte swap descriptors in hardware */ 3197 /* byte swap descriptors in hardware */
@@ -3254,21 +3244,29 @@ static int __devinit sky2_probe(struct pci_dev *pdev,
3254 } 3244 }
3255 } 3245 }
3256 3246
3257 err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw); 3247 if (!disable_msi && pci_enable_msi(pdev) == 0) {
3248 err = sky2_test_msi(hw);
3249 if (err == -EOPNOTSUPP)
3250 pci_disable_msi(pdev);
3251 else if (err)
3252 goto err_out_unregister;
3253 }
3254
3255 err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw);
3258 if (err) { 3256 if (err) {
3259 printk(KERN_ERR PFX "%s: cannot assign irq %d\n", 3257 printk(KERN_ERR PFX "%s: cannot assign irq %d\n",
3260 pci_name(pdev), pdev->irq); 3258 pci_name(pdev), pdev->irq);
3261 goto err_out_unregister; 3259 goto err_out_unregister;
3262 } 3260 }
3263 3261
3264 hw->intr_mask = Y2_IS_BASE; 3262 sky2_write32(hw, B0_IMSK, Y2_IS_BASE);
3265 sky2_write32(hw, B0_IMSK, hw->intr_mask);
3266 3263
3267 pci_set_drvdata(pdev, hw); 3264 pci_set_drvdata(pdev, hw);
3268 3265
3269 return 0; 3266 return 0;
3270 3267
3271err_out_unregister: 3268err_out_unregister:
3269 pci_disable_msi(pdev);
3272 if (dev1) { 3270 if (dev1) {
3273 unregister_netdev(dev1); 3271 unregister_netdev(dev1);
3274 free_netdev(dev1); 3272 free_netdev(dev1);
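Putting the pieces together, the probe path now tries MSI, validates it with the self-test, and silently drops back to INTx when the test interrupt never arrives. A condensed sketch, with test_msi() standing in for sky2_test_msi():

/* Sketch only: MSI enable / verify / INTx fallback as wired into sky2_probe(). */
static int example_setup_irq(struct pci_dev *pdev, struct sky2_hw *hw)
{
	int err;

	if (!disable_msi && pci_enable_msi(pdev) == 0) {
		err = test_msi(hw);			/* stand-in for sky2_test_msi() */
		if (err == -EOPNOTSUPP)
			pci_disable_msi(pdev);		/* fall back to INTx */
		else if (err)
			return err;
	}
	return request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw);
}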
@@ -3311,6 +3309,7 @@ static void __devexit sky2_remove(struct pci_dev *pdev)
3311 sky2_read8(hw, B0_CTST); 3309 sky2_read8(hw, B0_CTST);
3312 3310
3313 free_irq(pdev->irq, hw); 3311 free_irq(pdev->irq, hw);
3312 pci_disable_msi(pdev);
3314 pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma); 3313 pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma);
3315 pci_release_regions(pdev); 3314 pci_release_regions(pdev);
3316 pci_disable_device(pdev); 3315 pci_disable_device(pdev);
diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
index dce955c76f3c..d63cd5a1b71c 100644
--- a/drivers/net/sky2.h
+++ b/drivers/net/sky2.h
@@ -278,13 +278,11 @@ enum {
278 Y2_IS_CHK_TXS1 = 1<<1, /* Descriptor error TXS 1 */ 278 Y2_IS_CHK_TXS1 = 1<<1, /* Descriptor error TXS 1 */
279 Y2_IS_CHK_TXA1 = 1<<0, /* Descriptor error TXA 1 */ 279 Y2_IS_CHK_TXA1 = 1<<0, /* Descriptor error TXA 1 */
280 280
281 Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU | 281 Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU,
282 Y2_IS_POLL_CHK | Y2_IS_TWSI_RDY | 282 Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1
283 Y2_IS_IRQ_SW | Y2_IS_TIMINT, 283 | Y2_IS_CHK_TXA1 | Y2_IS_CHK_RX1,
284 Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1 | 284 Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2
285 Y2_IS_CHK_RX1 | Y2_IS_CHK_TXA1 | Y2_IS_CHK_TXS1, 285 | Y2_IS_CHK_TXA2 | Y2_IS_CHK_RX2,
286 Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2 |
287 Y2_IS_CHK_RX2 | Y2_IS_CHK_TXA2 | Y2_IS_CHK_TXS2,
288}; 286};
289 287
290/* B2_IRQM_HWE_MSK 32 bit IRQ Moderation HW Error Mask */ 288/* B2_IRQM_HWE_MSK 32 bit IRQ Moderation HW Error Mask */
@@ -1832,6 +1830,7 @@ struct sky2_port {
1832 struct net_device *netdev; 1830 struct net_device *netdev;
1833 unsigned port; 1831 unsigned port;
1834 u32 msg_enable; 1832 u32 msg_enable;
1833 spinlock_t phy_lock;
1835 1834
1836 spinlock_t tx_lock ____cacheline_aligned_in_smp; 1835 spinlock_t tx_lock ____cacheline_aligned_in_smp;
1837 struct tx_ring_info *tx_ring; 1836 struct tx_ring_info *tx_ring;
@@ -1840,7 +1839,6 @@ struct sky2_port {
1840 u16 tx_prod; /* next le to use */ 1839 u16 tx_prod; /* next le to use */
1841 u32 tx_addr64; 1840 u32 tx_addr64;
1842 u16 tx_pending; 1841 u16 tx_pending;
1843 u16 tx_last_put;
1844 u16 tx_last_mss; 1842 u16 tx_last_mss;
1845 1843
1846 struct ring_info *rx_ring ____cacheline_aligned_in_smp; 1844 struct ring_info *rx_ring ____cacheline_aligned_in_smp;
@@ -1849,7 +1847,6 @@ struct sky2_port {
1849 u16 rx_next; /* next re to check */ 1847 u16 rx_next; /* next re to check */
1850 u16 rx_put; /* next le index to use */ 1848 u16 rx_put; /* next le index to use */
1851 u16 rx_pending; 1849 u16 rx_pending;
1852 u16 rx_last_put;
1853 u16 rx_bufsize; 1850 u16 rx_bufsize;
1854#ifdef SKY2_VLAN_TAG_USED 1851#ifdef SKY2_VLAN_TAG_USED
1855 u16 rx_tag; 1852 u16 rx_tag;
@@ -1865,20 +1862,15 @@ struct sky2_port {
1865 u8 rx_pause; 1862 u8 rx_pause;
1866 u8 tx_pause; 1863 u8 tx_pause;
1867 u8 rx_csum; 1864 u8 rx_csum;
1868 u8 wol;
1869 1865
1870 struct net_device_stats net_stats; 1866 struct net_device_stats net_stats;
1871 1867
1872 struct work_struct phy_task;
1873 struct semaphore phy_sema;
1874}; 1868};
1875 1869
1876struct sky2_hw { 1870struct sky2_hw {
1877 void __iomem *regs; 1871 void __iomem *regs;
1878 struct pci_dev *pdev; 1872 struct pci_dev *pdev;
1879 struct net_device *dev[2]; 1873 struct net_device *dev[2];
1880 spinlock_t hw_lock;
1881 u32 intr_mask;
1882 1874
1883 int pm_cap; 1875 int pm_cap;
1884 u8 chip_id; 1876 u8 chip_id;
@@ -1889,6 +1881,8 @@ struct sky2_hw {
1889 struct sky2_status_le *st_le; 1881 struct sky2_status_le *st_le;
1890 u32 st_idx; 1882 u32 st_idx;
1891 dma_addr_t st_dma; 1883 dma_addr_t st_dma;
1884 int msi_detected;
1885 wait_queue_head_t msi_wait;
1892}; 1886};
1893 1887
1894/* Register accessor for memory mapped device */ 1888/* Register accessor for memory mapped device */
diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c
index 75e9b3b910cc..0e9833adf9fe 100644
--- a/drivers/net/smc91x.c
+++ b/drivers/net/smc91x.c
@@ -215,15 +215,12 @@ struct smc_local {
215 215
216 spinlock_t lock; 216 spinlock_t lock;
217 217
218#ifdef SMC_CAN_USE_DATACS
219 u32 __iomem *datacs;
220#endif
221
222#ifdef SMC_USE_PXA_DMA 218#ifdef SMC_USE_PXA_DMA
223 /* DMA needs the physical address of the chip */ 219 /* DMA needs the physical address of the chip */
224 u_long physaddr; 220 u_long physaddr;
225#endif 221#endif
226 void __iomem *base; 222 void __iomem *base;
223 void __iomem *datacs;
227}; 224};
228 225
229#if SMC_DEBUG > 0 226#if SMC_DEBUG > 0
@@ -2104,9 +2101,8 @@ static int smc_enable_device(struct platform_device *pdev)
2104 * Set the appropriate byte/word mode. 2101 * Set the appropriate byte/word mode.
2105 */ 2102 */
2106 ecsr = readb(addr + (ECSR << SMC_IO_SHIFT)) & ~ECSR_IOIS8; 2103 ecsr = readb(addr + (ECSR << SMC_IO_SHIFT)) & ~ECSR_IOIS8;
2107#ifndef SMC_CAN_USE_16BIT 2104 if (!SMC_CAN_USE_16BIT)
2108 ecsr |= ECSR_IOIS8; 2105 ecsr |= ECSR_IOIS8;
2109#endif
2110 writeb(ecsr, addr + (ECSR << SMC_IO_SHIFT)); 2106 writeb(ecsr, addr + (ECSR << SMC_IO_SHIFT));
2111 local_irq_restore(flags); 2107 local_irq_restore(flags);
2112 2108
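The hunk above trades an #ifdef for an ordinary if on a constant that the platform section of smc91x.h is expected to define as 0 or 1; the dead branch still gets type-checked but is compiled away. A standalone sketch of the idea:

/* Sketch: compile-time 0/1 constant instead of #ifdef, as with SMC_CAN_USE_16BIT. */
#include <stdio.h>

#define CAN_USE_16BIT 0		/* a platform header would define this as 0 or 1 */

int main(void)
{
	unsigned char ecsr = 0;

	if (!CAN_USE_16BIT)	/* was: #ifndef SMC_CAN_USE_16BIT ... #endif */
		ecsr |= 0x01;	/* stand-in for ECSR_IOIS8 */

	printf("ecsr = 0x%02x\n", ecsr);
	return 0;
}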
@@ -2143,40 +2139,39 @@ static void smc_release_attrib(struct platform_device *pdev)
2143 release_mem_region(res->start, ATTRIB_SIZE); 2139 release_mem_region(res->start, ATTRIB_SIZE);
2144} 2140}
2145 2141
2146#ifdef SMC_CAN_USE_DATACS 2142static inline void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev)
2147static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev)
2148{ 2143{
2149 struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); 2144 if (SMC_CAN_USE_DATACS) {
2150 struct smc_local *lp = netdev_priv(ndev); 2145 struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32");
2146 struct smc_local *lp = netdev_priv(ndev);
2151 2147
2152 if (!res) 2148 if (!res)
2153 return; 2149 return;
2154 2150
2155 if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) { 2151 if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) {
2156 printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME); 2152 printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME);
2157 return; 2153 return;
2158 } 2154 }
2159 2155
2160 lp->datacs = ioremap(res->start, SMC_DATA_EXTENT); 2156 lp->datacs = ioremap(res->start, SMC_DATA_EXTENT);
2157 }
2161} 2158}
2162 2159
2163static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) 2160static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev)
2164{ 2161{
2165 struct smc_local *lp = netdev_priv(ndev); 2162 if (SMC_CAN_USE_DATACS) {
2166 struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); 2163 struct smc_local *lp = netdev_priv(ndev);
2164 struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32");
2167 2165
2168 if (lp->datacs) 2166 if (lp->datacs)
2169 iounmap(lp->datacs); 2167 iounmap(lp->datacs);
2170 2168
2171 lp->datacs = NULL; 2169 lp->datacs = NULL;
2172 2170
2173 if (res) 2171 if (res)
2174 release_mem_region(res->start, SMC_DATA_EXTENT); 2172 release_mem_region(res->start, SMC_DATA_EXTENT);
2173 }
2175} 2174}
2176#else
2177static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) {}
2178static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) {}
2179#endif
2180 2175
2181/* 2176/*
2182 * smc_init(void) 2177 * smc_init(void)
diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index e0efd1964e72..e1be1af51201 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -275,7 +275,10 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
275#define SMC_insw(a,r,p,l) readsw ((void*) ((a) + (r)), p, l) 275#define SMC_insw(a,r,p,l) readsw ((void*) ((a) + (r)), p, l)
276#define SMC_outw(v,a,r) ({ writew ((v), (a) + (r)); LPD7A40X_IOBARRIER; }) 276#define SMC_outw(v,a,r) ({ writew ((v), (a) + (r)); LPD7A40X_IOBARRIER; })
277 277
278static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) 278#define SMC_outsw LPD7A40X_SMC_outsw
279
280static inline void LPD7A40X_SMC_outsw(unsigned long a, int r,
281 unsigned char* p, int l)
279{ 282{
280 unsigned short* ps = (unsigned short*) p; 283 unsigned short* ps = (unsigned short*) p;
281 while (l-- > 0) { 284 while (l-- > 0) {
@@ -342,10 +345,6 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l)
342 345
343#endif 346#endif
344 347
345#ifndef SMC_IRQ_FLAGS
346#define SMC_IRQ_FLAGS SA_TRIGGER_RISING
347#endif
348
349#ifdef SMC_USE_PXA_DMA 348#ifdef SMC_USE_PXA_DMA
350/* 349/*
351 * Let's use the DMA engine on the XScale PXA2xx for RX packets. This is 350 * Let's use the DMA engine on the XScale PXA2xx for RX packets. This is
@@ -441,10 +440,85 @@ smc_pxa_dma_irq(int dma, void *dummy, struct pt_regs *regs)
441#endif /* SMC_USE_PXA_DMA */ 440#endif /* SMC_USE_PXA_DMA */
442 441
443 442
444/* Because of bank switching, the LAN91x uses only 16 I/O ports */ 443/*
444 * Everything a particular hardware setup needs should have been defined
445 * at this point. Add stubs for the undefined cases, mainly to avoid
446 * compilation warnings since they'll be optimized away, or to prevent buggy
447 * use of them.
448 */
449
450#if ! SMC_CAN_USE_32BIT
451#define SMC_inl(ioaddr, reg) ({ BUG(); 0; })
452#define SMC_outl(x, ioaddr, reg) BUG()
453#define SMC_insl(a, r, p, l) BUG()
454#define SMC_outsl(a, r, p, l) BUG()
455#endif
456
457#if !defined(SMC_insl) || !defined(SMC_outsl)
458#define SMC_insl(a, r, p, l) BUG()
459#define SMC_outsl(a, r, p, l) BUG()
460#endif
461
462#if ! SMC_CAN_USE_16BIT
463
464/*
465 * Any 16-bit access is performed with two 8-bit accesses if the hardware
466 * can't do it directly. Most registers are 16-bit so those are mandatory.
467 */
468#define SMC_outw(x, ioaddr, reg) \
469 do { \
470 unsigned int __val16 = (x); \
471 SMC_outb( __val16, ioaddr, reg ); \
472 SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\
473 } while (0)
474#define SMC_inw(ioaddr, reg) \
475 ({ \
476 unsigned int __val16; \
477 __val16 = SMC_inb( ioaddr, reg ); \
478 __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \
479 __val16; \
480 })
481
482#define SMC_insw(a, r, p, l) BUG()
483#define SMC_outsw(a, r, p, l) BUG()
484
485#endif
486
487#if !defined(SMC_insw) || !defined(SMC_outsw)
488#define SMC_insw(a, r, p, l) BUG()
489#define SMC_outsw(a, r, p, l) BUG()
490#endif
491
492#if ! SMC_CAN_USE_8BIT
493#define SMC_inb(ioaddr, reg) ({ BUG(); 0; })
494#define SMC_outb(x, ioaddr, reg) BUG()
495#define SMC_insb(a, r, p, l) BUG()
496#define SMC_outsb(a, r, p, l) BUG()
497#endif
498
499#if !defined(SMC_insb) || !defined(SMC_outsb)
500#define SMC_insb(a, r, p, l) BUG()
501#define SMC_outsb(a, r, p, l) BUG()
502#endif
503
504#ifndef SMC_CAN_USE_DATACS
505#define SMC_CAN_USE_DATACS 0
506#endif
507
445#ifndef SMC_IO_SHIFT 508#ifndef SMC_IO_SHIFT
446#define SMC_IO_SHIFT 0 509#define SMC_IO_SHIFT 0
447#endif 510#endif
511
512#ifndef SMC_IRQ_FLAGS
513#define SMC_IRQ_FLAGS SA_TRIGGER_RISING
514#endif
515
516#ifndef SMC_INTERRUPT_PREAMBLE
517#define SMC_INTERRUPT_PREAMBLE
518#endif
519
520
521/* Because of bank switching, the LAN91x uses only 16 I/O ports */
448#define SMC_IO_EXTENT (16 << SMC_IO_SHIFT) 522#define SMC_IO_EXTENT (16 << SMC_IO_SHIFT)
449#define SMC_DATA_EXTENT (4) 523#define SMC_DATA_EXTENT (4)
450 524
@@ -817,6 +891,11 @@ static const char * chip_ids[ 16 ] = {
817 * Note: the following macros do *not* select the bank -- this must 891 * Note: the following macros do *not* select the bank -- this must
818 * be done separately as needed in the main code. The SMC_REG() macro 892 * be done separately as needed in the main code. The SMC_REG() macro
819 * only uses the bank argument for debugging purposes (when enabled). 893 * only uses the bank argument for debugging purposes (when enabled).
894 *
895 * Note: despite inline functions being safer, everything leading to this
896 * should preferably be macros to let BUG() display the line number in
897 * the core source code since we're interested in the top call site
898 * not in any inline function location.
820 */ 899 */
821 900
822#if SMC_DEBUG > 0 901#if SMC_DEBUG > 0
@@ -834,62 +913,142 @@ static const char * chip_ids[ 16 ] = {
834#define SMC_REG(reg, bank) (reg<<SMC_IO_SHIFT) 913#define SMC_REG(reg, bank) (reg<<SMC_IO_SHIFT)
835#endif 914#endif
836 915
837#if SMC_CAN_USE_8BIT 916/*
838#define SMC_GET_PN() SMC_inb( ioaddr, PN_REG ) 917 * Hack Alert: Some setups just can't write 8 or 16 bits reliably when not
839#define SMC_SET_PN(x) SMC_outb( x, ioaddr, PN_REG ) 918 * aligned to a 32 bit boundary. I tell you that does exist!
840#define SMC_GET_AR() SMC_inb( ioaddr, AR_REG ) 919 * Fortunately the affected register accesses can be easily worked around
841#define SMC_GET_TXFIFO() SMC_inb( ioaddr, TXFIFO_REG ) 920 * since we can write zeroes to the preceeding 16 bits without adverse
 842#define SMC_GET_RXFIFO()	SMC_inb( ioaddr, RXFIFO_REG ) 921 * since we can write zeroes to the preceding 16 bits without adverse
843#define SMC_GET_INT() SMC_inb( ioaddr, INT_REG ) 922 *
844#define SMC_ACK_INT(x) SMC_outb( x, ioaddr, INT_REG ) 923 * Enforce it on any 32-bit capable setup for now.
845#define SMC_GET_INT_MASK() SMC_inb( ioaddr, IM_REG ) 924 */
846#define SMC_SET_INT_MASK(x) SMC_outb( x, ioaddr, IM_REG ) 925#define SMC_MUST_ALIGN_WRITE SMC_CAN_USE_32BIT
847#else 926
848#define SMC_GET_PN() (SMC_inw( ioaddr, PN_REG ) & 0xFF) 927#define SMC_GET_PN() \
849#define SMC_SET_PN(x) SMC_outw( x, ioaddr, PN_REG ) 928 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, PN_REG)) \
850#define SMC_GET_AR() (SMC_inw( ioaddr, PN_REG ) >> 8) 929 : (SMC_inw(ioaddr, PN_REG) & 0xFF) )
851#define SMC_GET_TXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) & 0xFF) 930
852#define SMC_GET_RXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) >> 8) 931#define SMC_SET_PN(x) \
853#define SMC_GET_INT() (SMC_inw( ioaddr, INT_REG ) & 0xFF) 932 do { \
933 if (SMC_MUST_ALIGN_WRITE) \
934 SMC_outl((x)<<16, ioaddr, SMC_REG(0, 2)); \
935 else if (SMC_CAN_USE_8BIT) \
936 SMC_outb(x, ioaddr, PN_REG); \
937 else \
938 SMC_outw(x, ioaddr, PN_REG); \
939 } while (0)
940
941#define SMC_GET_AR() \
942 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, AR_REG)) \
943 : (SMC_inw(ioaddr, PN_REG) >> 8) )
944
945#define SMC_GET_TXFIFO() \
946 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, TXFIFO_REG)) \
947 : (SMC_inw(ioaddr, TXFIFO_REG) & 0xFF) )
948
949#define SMC_GET_RXFIFO() \
950 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, RXFIFO_REG)) \
951 : (SMC_inw(ioaddr, TXFIFO_REG) >> 8) )
952
953#define SMC_GET_INT() \
954 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, INT_REG)) \
955 : (SMC_inw(ioaddr, INT_REG) & 0xFF) )
956
854#define SMC_ACK_INT(x) \ 957#define SMC_ACK_INT(x) \
855 do { \ 958 do { \
856 unsigned long __flags; \ 959 if (SMC_CAN_USE_8BIT) \
857 int __mask; \ 960 SMC_outb(x, ioaddr, INT_REG); \
858 local_irq_save(__flags); \ 961 else { \
859 __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \ 962 unsigned long __flags; \
860 SMC_outw( __mask | (x), ioaddr, INT_REG ); \ 963 int __mask; \
861 local_irq_restore(__flags); \ 964 local_irq_save(__flags); \
965 __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \
966 SMC_outw( __mask | (x), ioaddr, INT_REG ); \
967 local_irq_restore(__flags); \
968 } \
969 } while (0)
970
971#define SMC_GET_INT_MASK() \
972 ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, IM_REG)) \
973 : (SMC_inw( ioaddr, INT_REG ) >> 8) )
974
975#define SMC_SET_INT_MASK(x) \
976 do { \
977 if (SMC_CAN_USE_8BIT) \
978 SMC_outb(x, ioaddr, IM_REG); \
979 else \
980 SMC_outw((x) << 8, ioaddr, INT_REG); \
981 } while (0)
982
983#define SMC_CURRENT_BANK() SMC_inw(ioaddr, BANK_SELECT)
984
985#define SMC_SELECT_BANK(x) \
986 do { \
987 if (SMC_MUST_ALIGN_WRITE) \
988 SMC_outl((x)<<16, ioaddr, 12<<SMC_IO_SHIFT); \
989 else \
990 SMC_outw(x, ioaddr, BANK_SELECT); \
991 } while (0)
992
993#define SMC_GET_BASE() SMC_inw(ioaddr, BASE_REG)
994
995#define SMC_SET_BASE(x) SMC_outw(x, ioaddr, BASE_REG)
996
997#define SMC_GET_CONFIG() SMC_inw(ioaddr, CONFIG_REG)
998
999#define SMC_SET_CONFIG(x) SMC_outw(x, ioaddr, CONFIG_REG)
1000
1001#define SMC_GET_COUNTER() SMC_inw(ioaddr, COUNTER_REG)
1002
1003#define SMC_GET_CTL() SMC_inw(ioaddr, CTL_REG)
1004
1005#define SMC_SET_CTL(x) SMC_outw(x, ioaddr, CTL_REG)
1006
1007#define SMC_GET_MII() SMC_inw(ioaddr, MII_REG)
1008
1009#define SMC_SET_MII(x) SMC_outw(x, ioaddr, MII_REG)
1010
1011#define SMC_GET_MIR() SMC_inw(ioaddr, MIR_REG)
1012
1013#define SMC_SET_MIR(x) SMC_outw(x, ioaddr, MIR_REG)
1014
1015#define SMC_GET_MMU_CMD() SMC_inw(ioaddr, MMU_CMD_REG)
1016
1017#define SMC_SET_MMU_CMD(x) SMC_outw(x, ioaddr, MMU_CMD_REG)
1018
1019#define SMC_GET_FIFO() SMC_inw(ioaddr, FIFO_REG)
1020
1021#define SMC_GET_PTR() SMC_inw(ioaddr, PTR_REG)
1022
1023#define SMC_SET_PTR(x) \
1024 do { \
1025 if (SMC_MUST_ALIGN_WRITE) \
1026 SMC_outl((x)<<16, ioaddr, SMC_REG(4, 2)); \
1027 else \
1028 SMC_outw(x, ioaddr, PTR_REG); \
862 } while (0) 1029 } while (0)
863#define SMC_GET_INT_MASK() (SMC_inw( ioaddr, INT_REG ) >> 8)
864#define SMC_SET_INT_MASK(x) SMC_outw( (x) << 8, ioaddr, INT_REG )
865#endif
866 1030
867#define SMC_CURRENT_BANK() SMC_inw( ioaddr, BANK_SELECT ) 1031#define SMC_GET_EPH_STATUS() SMC_inw(ioaddr, EPH_STATUS_REG)
868#define SMC_SELECT_BANK(x) SMC_outw( x, ioaddr, BANK_SELECT ) 1032
869#define SMC_GET_BASE() SMC_inw( ioaddr, BASE_REG ) 1033#define SMC_GET_RCR() SMC_inw(ioaddr, RCR_REG)
870#define SMC_SET_BASE(x) SMC_outw( x, ioaddr, BASE_REG ) 1034
871#define SMC_GET_CONFIG() SMC_inw( ioaddr, CONFIG_REG ) 1035#define SMC_SET_RCR(x) SMC_outw(x, ioaddr, RCR_REG)
872#define SMC_SET_CONFIG(x) SMC_outw( x, ioaddr, CONFIG_REG ) 1036
873#define SMC_GET_COUNTER() SMC_inw( ioaddr, COUNTER_REG ) 1037#define SMC_GET_REV() SMC_inw(ioaddr, REV_REG)
874#define SMC_GET_CTL() SMC_inw( ioaddr, CTL_REG ) 1038
875#define SMC_SET_CTL(x) SMC_outw( x, ioaddr, CTL_REG ) 1039#define SMC_GET_RPC() SMC_inw(ioaddr, RPC_REG)
876#define SMC_GET_MII() SMC_inw( ioaddr, MII_REG ) 1040
877#define SMC_SET_MII(x) SMC_outw( x, ioaddr, MII_REG ) 1041#define SMC_SET_RPC(x) \
878#define SMC_GET_MIR() SMC_inw( ioaddr, MIR_REG ) 1042 do { \
879#define SMC_SET_MIR(x) SMC_outw( x, ioaddr, MIR_REG ) 1043 if (SMC_MUST_ALIGN_WRITE) \
880#define SMC_GET_MMU_CMD() SMC_inw( ioaddr, MMU_CMD_REG ) 1044 SMC_outl((x)<<16, ioaddr, SMC_REG(8, 0)); \
881#define SMC_SET_MMU_CMD(x) SMC_outw( x, ioaddr, MMU_CMD_REG ) 1045 else \
882#define SMC_GET_FIFO() SMC_inw( ioaddr, FIFO_REG ) 1046 SMC_outw(x, ioaddr, RPC_REG); \
883#define SMC_GET_PTR() SMC_inw( ioaddr, PTR_REG ) 1047 } while (0)
884#define SMC_SET_PTR(x) SMC_outw( x, ioaddr, PTR_REG ) 1048
885#define SMC_GET_EPH_STATUS() SMC_inw( ioaddr, EPH_STATUS_REG ) 1049#define SMC_GET_TCR() SMC_inw(ioaddr, TCR_REG)
886#define SMC_GET_RCR() SMC_inw( ioaddr, RCR_REG ) 1050
887#define SMC_SET_RCR(x) SMC_outw( x, ioaddr, RCR_REG ) 1051#define SMC_SET_TCR(x) SMC_outw(x, ioaddr, TCR_REG)
888#define SMC_GET_REV() SMC_inw( ioaddr, REV_REG )
889#define SMC_GET_RPC() SMC_inw( ioaddr, RPC_REG )
890#define SMC_SET_RPC(x) SMC_outw( x, ioaddr, RPC_REG )
891#define SMC_GET_TCR() SMC_inw( ioaddr, TCR_REG )
892#define SMC_SET_TCR(x) SMC_outw( x, ioaddr, TCR_REG )
893 1052
894#ifndef SMC_GET_MAC_ADDR 1053#ifndef SMC_GET_MAC_ADDR
895#define SMC_GET_MAC_ADDR(addr) \ 1054#define SMC_GET_MAC_ADDR(addr) \
@@ -920,151 +1079,84 @@ static const char * chip_ids[ 16 ] = {
920 SMC_outw( mt[6] | (mt[7] << 8), ioaddr, MCAST_REG4 ); \ 1079 SMC_outw( mt[6] | (mt[7] << 8), ioaddr, MCAST_REG4 ); \
921 } while (0) 1080 } while (0)
922 1081
923#if SMC_CAN_USE_32BIT
924/*
925 * Some setups just can't write 8 or 16 bits reliably when not aligned
926 * to a 32 bit boundary. I tell you that exists!
927 * We re-do the ones here that can be easily worked around if they can have
928 * their low parts written to 0 without adverse effects.
929 */
930#undef SMC_SELECT_BANK
931#define SMC_SELECT_BANK(x) SMC_outl( (x)<<16, ioaddr, 12<<SMC_IO_SHIFT )
932#undef SMC_SET_RPC
933#define SMC_SET_RPC(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(8, 0) )
934#undef SMC_SET_PN
935#define SMC_SET_PN(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(0, 2) )
936#undef SMC_SET_PTR
937#define SMC_SET_PTR(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(4, 2) )
938#endif
939
940#if SMC_CAN_USE_32BIT
941#define SMC_PUT_PKT_HDR(status, length) \
942 SMC_outl( (status) | (length) << 16, ioaddr, DATA_REG )
943#define SMC_GET_PKT_HDR(status, length) \
944 do { \
945 unsigned int __val = SMC_inl( ioaddr, DATA_REG ); \
946 (status) = __val & 0xffff; \
947 (length) = __val >> 16; \
948 } while (0)
949#else
950#define SMC_PUT_PKT_HDR(status, length) \ 1082#define SMC_PUT_PKT_HDR(status, length) \
951 do { \ 1083 do { \
952 SMC_outw( status, ioaddr, DATA_REG ); \ 1084 if (SMC_CAN_USE_32BIT) \
953 SMC_outw( length, ioaddr, DATA_REG ); \ 1085 SMC_outl((status) | (length)<<16, ioaddr, DATA_REG); \
954 } while (0) 1086 else { \
955#define SMC_GET_PKT_HDR(status, length) \ 1087 SMC_outw(status, ioaddr, DATA_REG); \
956 do { \ 1088 SMC_outw(length, ioaddr, DATA_REG); \
957 (status) = SMC_inw( ioaddr, DATA_REG ); \ 1089 } \
958 (length) = SMC_inw( ioaddr, DATA_REG ); \
959 } while (0) 1090 } while (0)
960#endif
961 1091
962#if SMC_CAN_USE_32BIT 1092#define SMC_GET_PKT_HDR(status, length) \
963#define _SMC_PUSH_DATA(p, l) \
964 do { \ 1093 do { \
965 char *__ptr = (p); \ 1094 if (SMC_CAN_USE_32BIT) { \
966 int __len = (l); \ 1095 unsigned int __val = SMC_inl(ioaddr, DATA_REG); \
967 if (__len >= 2 && (unsigned long)__ptr & 2) { \ 1096 (status) = __val & 0xffff; \
968 __len -= 2; \ 1097 (length) = __val >> 16; \
969 SMC_outw( *(u16 *)__ptr, ioaddr, DATA_REG ); \ 1098 } else { \
970 __ptr += 2; \ 1099 (status) = SMC_inw(ioaddr, DATA_REG); \
971 } \ 1100 (length) = SMC_inw(ioaddr, DATA_REG); \
972 SMC_outsl( ioaddr, DATA_REG, __ptr, __len >> 2); \
973 if (__len & 2) { \
974 __ptr += (__len & ~3); \
975 SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \
976 } \ 1101 } \
977 } while (0) 1102 } while (0)
978#define _SMC_PULL_DATA(p, l) \
979 do { \
980 char *__ptr = (p); \
981 int __len = (l); \
982 if ((unsigned long)__ptr & 2) { \
983 /* \
984 * We want 32bit alignment here. \
985 * Since some buses perform a full 32bit \
986 * fetch even for 16bit data we can't use \
987 * SMC_inw() here. Back both source (on chip \
988 * and destination) pointers of 2 bytes. \
989 */ \
990 __ptr -= 2; \
991 __len += 2; \
992 SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \
993 } \
994 __len += 2; \
995 SMC_insl( ioaddr, DATA_REG, __ptr, __len >> 2); \
996 } while (0)
997#elif SMC_CAN_USE_16BIT
998#define _SMC_PUSH_DATA(p, l) SMC_outsw( ioaddr, DATA_REG, p, (l) >> 1 )
999#define _SMC_PULL_DATA(p, l) SMC_insw ( ioaddr, DATA_REG, p, (l) >> 1 )
1000#elif SMC_CAN_USE_8BIT
1001#define _SMC_PUSH_DATA(p, l) SMC_outsb( ioaddr, DATA_REG, p, l )
1002#define _SMC_PULL_DATA(p, l) SMC_insb ( ioaddr, DATA_REG, p, l )
1003#endif
1004 1103
1005#if ! SMC_CAN_USE_16BIT 1104#define SMC_PUSH_DATA(p, l) \
1006#define SMC_outw(x, ioaddr, reg) \
1007 do { \ 1105 do { \
1008 unsigned int __val16 = (x); \ 1106 if (SMC_CAN_USE_32BIT) { \
1009 SMC_outb( __val16, ioaddr, reg ); \ 1107 void *__ptr = (p); \
1010 SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\ 1108 int __len = (l); \
1109 void *__ioaddr = ioaddr; \
1110 if (__len >= 2 && (unsigned long)__ptr & 2) { \
1111 __len -= 2; \
1112 SMC_outw(*(u16 *)__ptr, ioaddr, DATA_REG); \
1113 __ptr += 2; \
1114 } \
1115 if (SMC_CAN_USE_DATACS && lp->datacs) \
1116 __ioaddr = lp->datacs; \
1117 SMC_outsl(__ioaddr, DATA_REG, __ptr, __len>>2); \
1118 if (__len & 2) { \
1119 __ptr += (__len & ~3); \
1120 SMC_outw(*((u16 *)__ptr), ioaddr, DATA_REG); \
1121 } \
1122 } else if (SMC_CAN_USE_16BIT) \
1123 SMC_outsw(ioaddr, DATA_REG, p, (l) >> 1); \
1124 else if (SMC_CAN_USE_8BIT) \
1125 SMC_outsb(ioaddr, DATA_REG, p, l); \
1011 } while (0) 1126 } while (0)
1012#define SMC_inw(ioaddr, reg) \
1013 ({ \
1014 unsigned int __val16; \
1015 __val16 = SMC_inb( ioaddr, reg ); \
1016 __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \
1017 __val16; \
1018 })
1019#endif
1020
1021#ifdef SMC_CAN_USE_DATACS
1022#define SMC_PUSH_DATA(p, l) \
1023 if ( lp->datacs ) { \
1024 unsigned char *__ptr = (p); \
1025 int __len = (l); \
1026 if (__len >= 2 && (unsigned long)__ptr & 2) { \
1027 __len -= 2; \
1028 SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \
1029 __ptr += 2; \
1030 } \
1031 outsl(lp->datacs, __ptr, __len >> 2); \
1032 if (__len & 2) { \
1033 __ptr += (__len & ~3); \
1034 SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \
1035 } \
1036 } else { \
1037 _SMC_PUSH_DATA(p, l); \
1038 }
1039 1127
1040#define SMC_PULL_DATA(p, l) \ 1128#define SMC_PULL_DATA(p, l) \
1041 if ( lp->datacs ) { \ 1129 do { \
1042 unsigned char *__ptr = (p); \ 1130 if (SMC_CAN_USE_32BIT) { \
1043 int __len = (l); \ 1131 void *__ptr = (p); \
1044 if ((unsigned long)__ptr & 2) { \ 1132 int __len = (l); \
1045 /* \ 1133 void *__ioaddr = ioaddr; \
1046 * We want 32bit alignment here. \ 1134 if ((unsigned long)__ptr & 2) { \
1047 * Since some buses perform a full 32bit \ 1135 /* \
1048 * fetch even for 16bit data we can't use \ 1136 * We want 32bit alignment here. \
1049 * SMC_inw() here. Back both source (on chip \ 1137 * Since some buses perform a full \
1050 * and destination) pointers of 2 bytes. \ 1138 * 32bit fetch even for 16bit data \
1051 */ \ 1139 * we can't use SMC_inw() here. \
1052 __ptr -= 2; \ 1140 * Back both source (on-chip) and \
1141 * destination pointers of 2 bytes. \
1142 * This is possible since the call to \
1143 * SMC_GET_PKT_HDR() already advanced \
1144 * the source pointer of 4 bytes, and \
1145 * the skb_reserve(skb, 2) advanced \
1146 * the destination pointer of 2 bytes. \
1147 */ \
1148 __ptr -= 2; \
1149 __len += 2; \
1150 SMC_SET_PTR(2|PTR_READ|PTR_RCV|PTR_AUTOINC); \
1151 } \
1152 if (SMC_CAN_USE_DATACS && lp->datacs) \
1153 __ioaddr = lp->datacs; \
1053 __len += 2; \ 1154 __len += 2; \
1054 SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \ 1155 SMC_insl(__ioaddr, DATA_REG, __ptr, __len>>2); \
1055 } \ 1156 } else if (SMC_CAN_USE_16BIT) \
1056 __len += 2; \ 1157 SMC_insw(ioaddr, DATA_REG, p, (l) >> 1); \
1057 insl( lp->datacs, __ptr, __len >> 2); \ 1158 else if (SMC_CAN_USE_8BIT) \
1058 } else { \ 1159 SMC_insb(ioaddr, DATA_REG, p, l); \
1059 _SMC_PULL_DATA(p, l); \ 1160 } while (0)
1060 }
1061#else
1062#define SMC_PUSH_DATA(p, l) _SMC_PUSH_DATA(p, l)
1063#define SMC_PULL_DATA(p, l) _SMC_PULL_DATA(p, l)
1064#endif
1065
1066#if !defined (SMC_INTERRUPT_PREAMBLE)
1067# define SMC_INTERRUPT_PREAMBLE
1068#endif
1069 1161
1070#endif /* _SMC91X_H_ */ 1162#endif /* _SMC91X_H_ */
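The rewritten header replaces the old compile-time #if ladder with macros that test the SMC_CAN_USE_8BIT/16BIT/32BIT constants at run time, and it keeps the workaround described in the new comment above: on buses that only write reliably on 32-bit boundaries, a 16-bit register sitting in the upper halfword is written with a single aligned 32-bit store whose low halfword is zero. A minimal sketch of that write trick follows; it is not from the patch, the helper name is invented, and it assumes a generic MMIO base plus a register whose lower-halfword neighbour tolerates being written with zero.

/*
 * Sketch only: write a 16-bit register that lives in the upper halfword of
 * a 32-bit word by issuing one aligned 32-bit store.  Safe only when the
 * register occupying the lower halfword can be written with zero, which is
 * the condition the smc91x comment spells out.
 */
#include <linux/types.h>
#include <asm/io.h>

static inline void write_upper_reg16(void __iomem *base,
                                     unsigned int aligned_off, u16 val)
{
        /* val lands in bits 31:16, i.e. the register at aligned_off + 2 */
        writel((u32)val << 16, base + aligned_off);
}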
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ff79e68b347c..7b82ff090d42 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -3639,7 +3639,7 @@ iscsi_tcp_init(void)
3639 3639
3640 taskcache = kmem_cache_create("iscsi_taskcache", 3640 taskcache = kmem_cache_create("iscsi_taskcache",
3641 sizeof(struct iscsi_data_task), 0, 3641 sizeof(struct iscsi_data_task), 0,
3642 SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL); 3642 SLAB_HWCACHE_ALIGN, NULL, NULL);
3643 if (!taskcache) 3643 if (!taskcache)
3644 return -ENOMEM; 3644 return -ENOMEM;
3645 3645
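The only change in this hunk drops SLAB_NO_REAP, a slab flag being removed from the allocator in the same series (the ocfs2 hunk further down makes the identical edit); the cache creation is otherwise unchanged. A minimal sketch of the same call pattern with the six-argument kmem_cache_create() of this kernel era; the cache and structure names are made up.

#include <linux/errno.h>
#include <linux/slab.h>

struct demo_task { char payload[64]; };         /* illustrative object */

static kmem_cache_t *demo_cache;

static int demo_cache_init(void)
{
        demo_cache = kmem_cache_create("demo_taskcache",
                                       sizeof(struct demo_task), 0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        return demo_cache ? 0 : -ENOMEM;
}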
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index a8b05ce5de52..7405d0df95db 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1139,32 +1139,6 @@ sg_fasync(int fd, struct file *filp, int mode)
1139 return (retval < 0) ? retval : 0; 1139 return (retval < 0) ? retval : 0;
1140} 1140}
1141 1141
1142/* When startFinish==1 increments page counts for pages other than the
1143 first of scatter gather elements obtained from alloc_pages().
1144 When startFinish==0 decrements ... */
1145static void
1146sg_rb_correct4mmap(Sg_scatter_hold * rsv_schp, int startFinish)
1147{
1148 struct scatterlist *sg = rsv_schp->buffer;
1149 struct page *page;
1150 int k, m;
1151
1152 SCSI_LOG_TIMEOUT(3, printk("sg_rb_correct4mmap: startFinish=%d, scatg=%d\n",
1153 startFinish, rsv_schp->k_use_sg));
1154 /* N.B. correction _not_ applied to base page of each allocation */
1155 for (k = 0; k < rsv_schp->k_use_sg; ++k, ++sg) {
1156 for (m = PAGE_SIZE; m < sg->length; m += PAGE_SIZE) {
1157 page = sg->page;
1158 if (startFinish)
1159 get_page(page);
1160 else {
1161 if (page_count(page) > 0)
1162 __put_page(page);
1163 }
1164 }
1165 }
1166}
1167
1168static struct page * 1142static struct page *
1169sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type) 1143sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
1170{ 1144{
@@ -1236,10 +1210,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
1236 sa += len; 1210 sa += len;
1237 } 1211 }
1238 1212
1239 if (0 == sfp->mmap_called) { 1213 sfp->mmap_called = 1;
1240 sg_rb_correct4mmap(rsv_schp, 1); /* do only once per fd lifetime */
1241 sfp->mmap_called = 1;
1242 }
1243 vma->vm_flags |= VM_RESERVED; 1214 vma->vm_flags |= VM_RESERVED;
1244 vma->vm_private_data = sfp; 1215 vma->vm_private_data = sfp;
1245 vma->vm_ops = &sg_mmap_vm_ops; 1216 vma->vm_ops = &sg_mmap_vm_ops;
@@ -2388,8 +2359,6 @@ __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp)
2388 SCSI_LOG_TIMEOUT(6, 2359 SCSI_LOG_TIMEOUT(6,
2389 printk("__sg_remove_sfp: bufflen=%d, k_use_sg=%d\n", 2360 printk("__sg_remove_sfp: bufflen=%d, k_use_sg=%d\n",
2390 (int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg)); 2361 (int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg));
2391 if (sfp->mmap_called)
2392 sg_rb_correct4mmap(&sfp->reserve, 0); /* undo correction */
2393 sg_remove_scat(&sfp->reserve); 2362 sg_remove_scat(&sfp->reserve);
2394 } 2363 }
2395 sfp->parentdp = NULL; 2364 sfp->parentdp = NULL;
@@ -2471,9 +2440,9 @@ sg_page_malloc(int rqSz, int lowDma, int *retSzp)
2471 return resp; 2440 return resp;
2472 2441
2473 if (lowDma) 2442 if (lowDma)
2474 page_mask = GFP_ATOMIC | GFP_DMA | __GFP_NOWARN; 2443 page_mask = GFP_ATOMIC | GFP_DMA | __GFP_COMP | __GFP_NOWARN;
2475 else 2444 else
2476 page_mask = GFP_ATOMIC | __GFP_NOWARN; 2445 page_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
2477 2446
2478 for (order = 0, a_size = PAGE_SIZE; a_size < rqSz; 2447 for (order = 0, a_size = PAGE_SIZE; a_size < rqSz;
2479 order++, a_size <<= 1) ; 2448 order++, a_size <<= 1) ;
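The two sg.c changes belong together: sg_page_malloc() now passes __GFP_COMP so multi-page buffers are allocated as compound pages, which redirects get_page()/put_page() on any constituent page to the head page, and that is what lets the patch delete the manual sg_rb_correct4mmap() refcount fix-ups. A minimal sketch of the allocation side, with an invented helper name:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Allocate 2^order pages as one compound page, as sg_page_malloc() now does. */
static struct page *alloc_mmapable_pages(unsigned int order, int low_dma)
{
        gfp_t mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;

        if (low_dma)
                mask |= GFP_DMA;
        return alloc_pages(mask, order);        /* NULL on failure */
}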
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 89e5413cc2a3..c66ef96c71b4 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -866,7 +866,7 @@ config SERIAL_M32R_PLDSIO
866 866
867config SERIAL_TXX9 867config SERIAL_TXX9
868 bool "TMPTX39XX/49XX SIO support" 868 bool "TMPTX39XX/49XX SIO support"
869 depends HAS_TXX9_SERIAL && BROKEN 869 depends HAS_TXX9_SERIAL
870 select SERIAL_CORE 870 select SERIAL_CORE
871 default y 871 default y
872 872
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index ee98a867bc6d..141173efd463 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -33,6 +33,10 @@
33 * 1.02 Cleanup. (import 8250.c changes) 33 * 1.02 Cleanup. (import 8250.c changes)
34 * 1.03 Fix low-latency mode. (import 8250.c changes) 34 * 1.03 Fix low-latency mode. (import 8250.c changes)
35 * 1.04 Remove usage of deprecated functions, cleanup. 35 * 1.04 Remove usage of deprecated functions, cleanup.
36 * 1.05 More strict check in verify_port. Cleanup.
37 * 1.06 Do not insert a char caused previous overrun.
38 * Fix some spin_locks.
39 * Do not call uart_add_one_port for absent ports.
36 */ 40 */
37#include <linux/config.h> 41#include <linux/config.h>
38 42
@@ -57,7 +61,7 @@
57#include <asm/io.h> 61#include <asm/io.h>
58#include <asm/irq.h> 62#include <asm/irq.h>
59 63
60static char *serial_version = "1.04"; 64static char *serial_version = "1.06";
61static char *serial_name = "TX39/49 Serial driver"; 65static char *serial_name = "TX39/49 Serial driver";
62 66
63#define PASS_LIMIT 256 67#define PASS_LIMIT 256
@@ -94,6 +98,8 @@ static char *serial_name = "TX39/49 Serial driver";
94#define UART_NR 4 98#define UART_NR 4
95#endif 99#endif
96 100
101#define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8)
102
97struct uart_txx9_port { 103struct uart_txx9_port {
98 struct uart_port port; 104 struct uart_port port;
99 105
@@ -210,7 +216,7 @@ static inline unsigned int sio_in(struct uart_txx9_port *up, int offset)
210{ 216{
211 switch (up->port.iotype) { 217 switch (up->port.iotype) {
212 default: 218 default:
213 return *(volatile u32 *)(up->port.membase + offset); 219 return __raw_readl(up->port.membase + offset);
214 case UPIO_PORT: 220 case UPIO_PORT:
215 return inl(up->port.iobase + offset); 221 return inl(up->port.iobase + offset);
216 } 222 }
@@ -221,7 +227,7 @@ sio_out(struct uart_txx9_port *up, int offset, int value)
221{ 227{
222 switch (up->port.iotype) { 228 switch (up->port.iotype) {
223 default: 229 default:
224 *(volatile u32 *)(up->port.membase + offset) = value; 230 __raw_writel(value, up->port.membase + offset);
225 break; 231 break;
226 case UPIO_PORT: 232 case UPIO_PORT:
227 outl(value, up->port.iobase + offset); 233 outl(value, up->port.iobase + offset);
@@ -259,34 +265,19 @@ sio_quot_set(struct uart_txx9_port *up, int quot)
259static void serial_txx9_stop_tx(struct uart_port *port) 265static void serial_txx9_stop_tx(struct uart_port *port)
260{ 266{
261 struct uart_txx9_port *up = (struct uart_txx9_port *)port; 267 struct uart_txx9_port *up = (struct uart_txx9_port *)port;
262 unsigned long flags;
263
264 spin_lock_irqsave(&up->port.lock, flags);
265 sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_TIE); 268 sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_TIE);
266 spin_unlock_irqrestore(&up->port.lock, flags);
267} 269}
268 270
269static void serial_txx9_start_tx(struct uart_port *port) 271static void serial_txx9_start_tx(struct uart_port *port)
270{ 272{
271 struct uart_txx9_port *up = (struct uart_txx9_port *)port; 273 struct uart_txx9_port *up = (struct uart_txx9_port *)port;
272 unsigned long flags;
273
274 spin_lock_irqsave(&up->port.lock, flags);
275 sio_set(up, TXX9_SIDICR, TXX9_SIDICR_TIE); 274 sio_set(up, TXX9_SIDICR, TXX9_SIDICR_TIE);
276 spin_unlock_irqrestore(&up->port.lock, flags);
277} 275}
278 276
279static void serial_txx9_stop_rx(struct uart_port *port) 277static void serial_txx9_stop_rx(struct uart_port *port)
280{ 278{
281 struct uart_txx9_port *up = (struct uart_txx9_port *)port; 279 struct uart_txx9_port *up = (struct uart_txx9_port *)port;
282 unsigned long flags;
283
284 spin_lock_irqsave(&up->port.lock, flags);
285 up->port.read_status_mask &= ~TXX9_SIDISR_RDIS; 280 up->port.read_status_mask &= ~TXX9_SIDISR_RDIS;
286#if 0
287 sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_RIE);
288#endif
289 spin_unlock_irqrestore(&up->port.lock, flags);
290} 281}
291 282
292static void serial_txx9_enable_ms(struct uart_port *port) 283static void serial_txx9_enable_ms(struct uart_port *port)
@@ -302,12 +293,16 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r
302 unsigned int disr = *status; 293 unsigned int disr = *status;
303 int max_count = 256; 294 int max_count = 256;
304 char flag; 295 char flag;
296 unsigned int next_ignore_status_mask;
305 297
306 do { 298 do {
307 ch = sio_in(up, TXX9_SIRFIFO); 299 ch = sio_in(up, TXX9_SIRFIFO);
308 flag = TTY_NORMAL; 300 flag = TTY_NORMAL;
309 up->port.icount.rx++; 301 up->port.icount.rx++;
310 302
303 /* mask out RFDN_MASK bit added by previous overrun */
304 next_ignore_status_mask =
305 up->port.ignore_status_mask & ~TXX9_SIDISR_RFDN_MASK;
311 if (unlikely(disr & (TXX9_SIDISR_UBRK | TXX9_SIDISR_UPER | 306 if (unlikely(disr & (TXX9_SIDISR_UBRK | TXX9_SIDISR_UPER |
312 TXX9_SIDISR_UFER | TXX9_SIDISR_UOER))) { 307 TXX9_SIDISR_UFER | TXX9_SIDISR_UOER))) {
313 /* 308 /*
@@ -328,8 +323,17 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r
328 up->port.icount.parity++; 323 up->port.icount.parity++;
329 else if (disr & TXX9_SIDISR_UFER) 324 else if (disr & TXX9_SIDISR_UFER)
330 up->port.icount.frame++; 325 up->port.icount.frame++;
331 if (disr & TXX9_SIDISR_UOER) 326 if (disr & TXX9_SIDISR_UOER) {
332 up->port.icount.overrun++; 327 up->port.icount.overrun++;
328 /*
329 * The receiver read buffer still holds
330 * a char which caused overrun.
331 * Ignore next char by adding RFDN_MASK
332 * to ignore_status_mask temporarily.
333 */
334 next_ignore_status_mask |=
335 TXX9_SIDISR_RFDN_MASK;
336 }
333 337
334 /* 338 /*
335 * Mask off conditions which should be ignored. 339
@@ -349,6 +353,7 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r
349 uart_insert_char(&up->port, disr, TXX9_SIDISR_UOER, ch, flag); 353 uart_insert_char(&up->port, disr, TXX9_SIDISR_UOER, ch, flag);
350 354
351 ignore_char: 355 ignore_char:
356 up->port.ignore_status_mask = next_ignore_status_mask;
352 disr = sio_in(up, TXX9_SIDISR); 357 disr = sio_in(up, TXX9_SIDISR);
353 } while (!(disr & TXX9_SIDISR_UVALID) && (max_count-- > 0)); 358 } while (!(disr & TXX9_SIDISR_UVALID) && (max_count-- > 0));
354 spin_unlock(&up->port.lock); 359 spin_unlock(&up->port.lock);
@@ -450,14 +455,11 @@ static unsigned int serial_txx9_get_mctrl(struct uart_port *port)
450static void serial_txx9_set_mctrl(struct uart_port *port, unsigned int mctrl) 455static void serial_txx9_set_mctrl(struct uart_port *port, unsigned int mctrl)
451{ 456{
452 struct uart_txx9_port *up = (struct uart_txx9_port *)port; 457 struct uart_txx9_port *up = (struct uart_txx9_port *)port;
453 unsigned long flags;
454 458
455 spin_lock_irqsave(&up->port.lock, flags);
456 if (mctrl & TIOCM_RTS) 459 if (mctrl & TIOCM_RTS)
457 sio_mask(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); 460 sio_mask(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC);
458 else 461 else
459 sio_set(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); 462 sio_set(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC);
460 spin_unlock_irqrestore(&up->port.lock, flags);
461} 463}
462 464
463static void serial_txx9_break_ctl(struct uart_port *port, int break_state) 465static void serial_txx9_break_ctl(struct uart_port *port, int break_state)
@@ -784,8 +786,14 @@ static void serial_txx9_config_port(struct uart_port *port, int uflags)
784static int 786static int
785serial_txx9_verify_port(struct uart_port *port, struct serial_struct *ser) 787serial_txx9_verify_port(struct uart_port *port, struct serial_struct *ser)
786{ 788{
787 if (ser->irq < 0 || 789 unsigned long new_port = ser->port;
788 ser->baud_base < 9600 || ser->type != PORT_TXX9) 790 if (HIGH_BITS_OFFSET)
791 new_port += (unsigned long)ser->port_high << HIGH_BITS_OFFSET;
792 if (ser->type != port->type ||
793 ser->irq != port->irq ||
794 ser->io_type != port->iotype ||
795 new_port != port->iobase ||
796 (unsigned long)ser->iomem_base != port->mapbase)
789 return -EINVAL; 797 return -EINVAL;
790 return 0; 798 return 0;
791} 799}
@@ -827,7 +835,8 @@ static void __init serial_txx9_register_ports(struct uart_driver *drv)
827 835
828 up->port.line = i; 836 up->port.line = i;
829 up->port.ops = &serial_txx9_pops; 837 up->port.ops = &serial_txx9_pops;
830 uart_add_one_port(drv, &up->port); 838 if (up->port.iobase || up->port.mapbase)
839 uart_add_one_port(drv, &up->port);
831 } 840 }
832} 841}
833 842
@@ -927,11 +936,6 @@ static int serial_txx9_console_setup(struct console *co, char *options)
927 return -ENODEV; 936 return -ENODEV;
928 937
929 /* 938 /*
930 * Temporary fix.
931 */
932 spin_lock_init(&port->lock);
933
934 /*
935 * Disable UART interrupts, set DTR and RTS high 939 * Disable UART interrupts, set DTR and RTS high
936 * and set speed. 940 * and set speed.
937 */ 941 */
@@ -1041,11 +1045,10 @@ static int __devinit serial_txx9_register_port(struct uart_port *port)
1041 mutex_lock(&serial_txx9_mutex); 1045 mutex_lock(&serial_txx9_mutex);
1042 for (i = 0; i < UART_NR; i++) { 1046 for (i = 0; i < UART_NR; i++) {
1043 uart = &serial_txx9_ports[i]; 1047 uart = &serial_txx9_ports[i];
1044 if (uart->port.type == PORT_UNKNOWN) 1048 if (!(uart->port.iobase || uart->port.mapbase))
1045 break; 1049 break;
1046 } 1050 }
1047 if (i < UART_NR) { 1051 if (i < UART_NR) {
1048 uart_remove_one_port(&serial_txx9_reg, &uart->port);
1049 uart->port.iobase = port->iobase; 1052 uart->port.iobase = port->iobase;
1050 uart->port.membase = port->membase; 1053 uart->port.membase = port->membase;
1051 uart->port.irq = port->irq; 1054 uart->port.irq = port->irq;
@@ -1080,9 +1083,8 @@ static void __devexit serial_txx9_unregister_port(int line)
1080 uart->port.type = PORT_UNKNOWN; 1083 uart->port.type = PORT_UNKNOWN;
1081 uart->port.iobase = 0; 1084 uart->port.iobase = 0;
1082 uart->port.mapbase = 0; 1085 uart->port.mapbase = 0;
1083 uart->port.membase = 0; 1086 uart->port.membase = NULL;
1084 uart->port.dev = NULL; 1087 uart->port.dev = NULL;
1085 uart_add_one_port(&serial_txx9_reg, &uart->port);
1086 mutex_unlock(&serial_txx9_mutex); 1088 mutex_unlock(&serial_txx9_mutex);
1087} 1089}
1088 1090
@@ -1198,8 +1200,11 @@ static void __exit serial_txx9_exit(void)
1198#ifdef ENABLE_SERIAL_TXX9_PCI 1200#ifdef ENABLE_SERIAL_TXX9_PCI
1199 pci_unregister_driver(&serial_txx9_pci_driver); 1201 pci_unregister_driver(&serial_txx9_pci_driver);
1200#endif 1202#endif
1201 for (i = 0; i < UART_NR; i++) 1203 for (i = 0; i < UART_NR; i++) {
1202 uart_remove_one_port(&serial_txx9_reg, &serial_txx9_ports[i].port); 1204 struct uart_txx9_port *up = &serial_txx9_ports[i];
1205 if (up->port.iobase || up->port.mapbase)
1206 uart_remove_one_port(&serial_txx9_reg, &up->port);
1207 }
1203 1208
1204 uart_unregister_driver(&serial_txx9_reg); 1209 uart_unregister_driver(&serial_txx9_reg);
1205} 1210}
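Among the serial_txx9 fixes, verify_port() now reconstructs the full I/O port address from the user-supplied serial_struct instead of only sanity-checking irq and baud_base. A minimal sketch of that reconstruction, assuming only <linux/serial.h>; the wrapper name is invented, while HIGH_BITS_OFFSET is the define the patch adds.

#include <linux/serial.h>

#define HIGH_BITS_OFFSET        ((sizeof(long) - sizeof(int)) * 8)

/* Rebuild a full port address from the 32-bit serial_struct fields.  On
 * 32-bit kernels HIGH_BITS_OFFSET is 0 and the shift is compiled out; on
 * 64-bit kernels the upper bits travel in ser->port_high. */
static unsigned long serial_struct_port(const struct serial_struct *ser)
{
        unsigned long port = ser->port;

        if (HIGH_BITS_OFFSET)
                port += (unsigned long)ser->port_high << HIGH_BITS_OFFSET;
        return port;
}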
diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c
index d61494d185cd..bd6294132c18 100644
--- a/drivers/serial/vr41xx_siu.c
+++ b/drivers/serial/vr41xx_siu.c
@@ -919,7 +919,7 @@ static struct uart_driver siu_uart_driver = {
919 .cons = SERIAL_VR41XX_CONSOLE, 919 .cons = SERIAL_VR41XX_CONSOLE,
920}; 920};
921 921
922static int siu_probe(struct platform_device *dev) 922static int __devinit siu_probe(struct platform_device *dev)
923{ 923{
924 struct uart_port *port; 924 struct uart_port *port;
925 int num, i, retval; 925 int num, i, retval;
@@ -953,7 +953,7 @@ static int siu_probe(struct platform_device *dev)
953 return 0; 953 return 0;
954} 954}
955 955
956static int siu_remove(struct platform_device *dev) 956static int __devexit siu_remove(struct platform_device *dev)
957{ 957{
958 struct uart_port *port; 958 struct uart_port *port;
959 int i; 959 int i;
@@ -1006,21 +1006,28 @@ static struct platform_device *siu_platform_device;
1006 1006
1007static struct platform_driver siu_device_driver = { 1007static struct platform_driver siu_device_driver = {
1008 .probe = siu_probe, 1008 .probe = siu_probe,
1009 .remove = siu_remove, 1009 .remove = __devexit_p(siu_remove),
1010 .suspend = siu_suspend, 1010 .suspend = siu_suspend,
1011 .resume = siu_resume, 1011 .resume = siu_resume,
1012 .driver = { 1012 .driver = {
1013 .name = "SIU", 1013 .name = "SIU",
1014 .owner = THIS_MODULE,
1014 }, 1015 },
1015}; 1016};
1016 1017
1017static int __devinit vr41xx_siu_init(void) 1018static int __init vr41xx_siu_init(void)
1018{ 1019{
1019 int retval; 1020 int retval;
1020 1021
1021 siu_platform_device = platform_device_register_simple("SIU", -1, NULL, 0); 1022 siu_platform_device = platform_device_alloc("SIU", -1);
1022 if (IS_ERR(siu_platform_device)) 1023 if (!siu_platform_device)
1023 return PTR_ERR(siu_platform_device); 1024 return -ENOMEM;
1025
1026 retval = platform_device_add(siu_platform_device);
1027 if (retval < 0) {
1028 platform_device_put(siu_platform_device);
1029 return retval;
1030 }
1024 1031
1025 retval = platform_driver_register(&siu_device_driver); 1032 retval = platform_driver_register(&siu_device_driver);
1026 if (retval < 0) 1033 if (retval < 0)
@@ -1029,10 +1036,9 @@ static int __devinit vr41xx_siu_init(void)
1029 return retval; 1036 return retval;
1030} 1037}
1031 1038
1032static void __devexit vr41xx_siu_exit(void) 1039static void __exit vr41xx_siu_exit(void)
1033{ 1040{
1034 platform_driver_unregister(&siu_device_driver); 1041 platform_driver_unregister(&siu_device_driver);
1035
1036 platform_device_unregister(siu_platform_device); 1042 platform_device_unregister(siu_platform_device);
1037} 1043}
1038 1044
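The vr41xx_siu init path switches from platform_device_register_simple() to the explicit allocate-then-add sequence so a failed add can drop the reference cleanly. A minimal sketch of that sequence under an invented device name:

#include <linux/errno.h>
#include <linux/platform_device.h>

static struct platform_device *demo_pdev;

static int demo_register(void)
{
        int ret;

        demo_pdev = platform_device_alloc("demo", -1);
        if (!demo_pdev)
                return -ENOMEM;

        ret = platform_device_add(demo_pdev);
        if (ret < 0) {
                platform_device_put(demo_pdev); /* frees the allocation */
                return ret;
        }
        return 0;
}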
diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c
index ea75b3d0612b..67140a5804f5 100644
--- a/drivers/sn/ioc4.c
+++ b/drivers/sn/ioc4.c
@@ -31,7 +31,7 @@
31#include <linux/ioc4.h> 31#include <linux/ioc4.h>
32#include <linux/mmtimer.h> 32#include <linux/mmtimer.h>
33#include <linux/rtc.h> 33#include <linux/rtc.h>
34#include <linux/rwsem.h> 34#include <linux/mutex.h>
35#include <asm/sn/addrs.h> 35#include <asm/sn/addrs.h>
36#include <asm/sn/clksupport.h> 36#include <asm/sn/clksupport.h>
37#include <asm/sn/shub_mmr.h> 37#include <asm/sn/shub_mmr.h>
@@ -54,11 +54,10 @@
54 * Submodule management * 54 * Submodule management *
55 ************************/ 55 ************************/
56 56
57static LIST_HEAD(ioc4_devices); 57static DEFINE_MUTEX(ioc4_mutex);
58static DECLARE_RWSEM(ioc4_devices_rwsem);
59 58
59static LIST_HEAD(ioc4_devices);
60static LIST_HEAD(ioc4_submodules); 60static LIST_HEAD(ioc4_submodules);
61static DECLARE_RWSEM(ioc4_submodules_rwsem);
62 61
63/* Register an IOC4 submodule */ 62/* Register an IOC4 submodule */
64int 63int
@@ -66,15 +65,13 @@ ioc4_register_submodule(struct ioc4_submodule *is)
66{ 65{
67 struct ioc4_driver_data *idd; 66 struct ioc4_driver_data *idd;
68 67
69 down_write(&ioc4_submodules_rwsem); 68 mutex_lock(&ioc4_mutex);
70 list_add(&is->is_list, &ioc4_submodules); 69 list_add(&is->is_list, &ioc4_submodules);
71 up_write(&ioc4_submodules_rwsem);
72 70
73 /* Initialize submodule for each IOC4 */ 71 /* Initialize submodule for each IOC4 */
74 if (!is->is_probe) 72 if (!is->is_probe)
75 return 0; 73 goto out;
76 74
77 down_read(&ioc4_devices_rwsem);
78 list_for_each_entry(idd, &ioc4_devices, idd_list) { 75 list_for_each_entry(idd, &ioc4_devices, idd_list) {
79 if (is->is_probe(idd)) { 76 if (is->is_probe(idd)) {
80 printk(KERN_WARNING 77 printk(KERN_WARNING
@@ -84,8 +81,8 @@ ioc4_register_submodule(struct ioc4_submodule *is)
84 pci_name(idd->idd_pdev)); 81 pci_name(idd->idd_pdev));
85 } 82 }
86 } 83 }
87 up_read(&ioc4_devices_rwsem); 84 out:
88 85 mutex_unlock(&ioc4_mutex);
89 return 0; 86 return 0;
90} 87}
91 88
@@ -95,15 +92,13 @@ ioc4_unregister_submodule(struct ioc4_submodule *is)
95{ 92{
96 struct ioc4_driver_data *idd; 93 struct ioc4_driver_data *idd;
97 94
98 down_write(&ioc4_submodules_rwsem); 95 mutex_lock(&ioc4_mutex);
99 list_del(&is->is_list); 96 list_del(&is->is_list);
100 up_write(&ioc4_submodules_rwsem);
101 97
102 /* Remove submodule for each IOC4 */ 98 /* Remove submodule for each IOC4 */
103 if (!is->is_remove) 99 if (!is->is_remove)
104 return; 100 goto out;
105 101
106 down_read(&ioc4_devices_rwsem);
107 list_for_each_entry(idd, &ioc4_devices, idd_list) { 102 list_for_each_entry(idd, &ioc4_devices, idd_list) {
108 if (is->is_remove(idd)) { 103 if (is->is_remove(idd)) {
109 printk(KERN_WARNING 104 printk(KERN_WARNING
@@ -113,7 +108,8 @@ ioc4_unregister_submodule(struct ioc4_submodule *is)
113 pci_name(idd->idd_pdev)); 108 pci_name(idd->idd_pdev));
114 } 109 }
115 } 110 }
116 up_read(&ioc4_devices_rwsem); 111 out:
112 mutex_unlock(&ioc4_mutex);
117} 113}
118 114
119/********************* 115/*********************
@@ -312,12 +308,11 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
312 /* Track PCI-device specific data */ 308 /* Track PCI-device specific data */
313 idd->idd_serial_data = NULL; 309 idd->idd_serial_data = NULL;
314 pci_set_drvdata(idd->idd_pdev, idd); 310 pci_set_drvdata(idd->idd_pdev, idd);
315 down_write(&ioc4_devices_rwsem); 311
312 mutex_lock(&ioc4_mutex);
316 list_add(&idd->idd_list, &ioc4_devices); 313 list_add(&idd->idd_list, &ioc4_devices);
317 up_write(&ioc4_devices_rwsem);
318 314
319 /* Add this IOC4 to all submodules */ 315 /* Add this IOC4 to all submodules */
320 down_read(&ioc4_submodules_rwsem);
321 list_for_each_entry(is, &ioc4_submodules, is_list) { 316 list_for_each_entry(is, &ioc4_submodules, is_list) {
322 if (is->is_probe && is->is_probe(idd)) { 317 if (is->is_probe && is->is_probe(idd)) {
323 printk(KERN_WARNING 318 printk(KERN_WARNING
@@ -327,7 +322,7 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
327 pci_name(idd->idd_pdev)); 322 pci_name(idd->idd_pdev));
328 } 323 }
329 } 324 }
330 up_read(&ioc4_submodules_rwsem); 325 mutex_unlock(&ioc4_mutex);
331 326
332 return 0; 327 return 0;
333 328
@@ -351,7 +346,7 @@ ioc4_remove(struct pci_dev *pdev)
351 idd = pci_get_drvdata(pdev); 346 idd = pci_get_drvdata(pdev);
352 347
353 /* Remove this IOC4 from all submodules */ 348 /* Remove this IOC4 from all submodules */
354 down_read(&ioc4_submodules_rwsem); 349 mutex_lock(&ioc4_mutex);
355 list_for_each_entry(is, &ioc4_submodules, is_list) { 350 list_for_each_entry(is, &ioc4_submodules, is_list) {
356 if (is->is_remove && is->is_remove(idd)) { 351 if (is->is_remove && is->is_remove(idd)) {
357 printk(KERN_WARNING 352 printk(KERN_WARNING
@@ -361,7 +356,7 @@ ioc4_remove(struct pci_dev *pdev)
361 pci_name(idd->idd_pdev)); 356 pci_name(idd->idd_pdev));
362 } 357 }
363 } 358 }
364 up_read(&ioc4_submodules_rwsem); 359 mutex_unlock(&ioc4_mutex);
365 360
366 /* Release resources */ 361 /* Release resources */
367 iounmap(idd->idd_misc_regs); 362 iounmap(idd->idd_misc_regs);
@@ -377,9 +372,9 @@ ioc4_remove(struct pci_dev *pdev)
377 pci_disable_device(pdev); 372 pci_disable_device(pdev);
378 373
379 /* Remove and free driver data */ 374 /* Remove and free driver data */
380 down_write(&ioc4_devices_rwsem); 375 mutex_lock(&ioc4_mutex);
381 list_del(&idd->idd_list); 376 list_del(&idd->idd_list);
382 up_write(&ioc4_devices_rwsem); 377 mutex_unlock(&ioc4_mutex);
383 kfree(idd); 378 kfree(idd);
384} 379}
385 380
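The ioc4 change collapses the two rw-semaphores into a single ioc4_mutex covering both the submodule and device lists, which is what all the lock/unlock hunks above reflect. A minimal sketch of the resulting locking shape, with invented names; the comment about sleeping callbacks states a property of mutexes, not the stated motivation of the patch.

#include <linux/list.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);
static LIST_HEAD(demo_devices);

struct demo_dev {
        struct list_head list;
};

static void demo_add(struct demo_dev *d)
{
        mutex_lock(&demo_mutex);
        list_add(&d->list, &demo_devices);
        mutex_unlock(&demo_mutex);
}

static void demo_for_each(int (*fn)(struct demo_dev *))
{
        struct demo_dev *d;

        mutex_lock(&demo_mutex);
        list_for_each_entry(d, &demo_devices, list)
                fn(d);  /* probe/remove-style callbacks may sleep under a mutex */
        mutex_unlock(&demo_mutex);
}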
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index b058273527bb..76448d6ae896 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end)
1269 */ 1269 */
1270 page = virt_to_page(virtual_start); 1270 page = virt_to_page(virtual_start);
1271 ClearPageReserved(page); 1271 ClearPageReserved(page);
1272 set_page_count(page, 1); 1272 init_page_count(page);
1273 free_page(virtual_start); 1273 free_page(virtual_start);
1274 1274
1275 virtual_start += PAGE_SIZE; 1275 virtual_start += PAGE_SIZE;
diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c
index d8467c03b49f..788297e9d59e 100644
--- a/drivers/video/i810/i810_main.c
+++ b/drivers/video/i810/i810_main.c
@@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info *info, struct fb_cursor *cursor)
1508 int size = ((cursor->image.width + 7) >> 3) * 1508 int size = ((cursor->image.width + 7) >> 3) *
1509 cursor->image.height; 1509 cursor->image.height;
1510 int i; 1510 int i;
1511 u8 *data = kmalloc(64 * 8, GFP_KERNEL); 1511 u8 *data = kmalloc(64 * 8, GFP_ATOMIC);
1512 1512
1513 if (data == NULL) 1513 if (data == NULL)
1514 return -ENOMEM; 1514 return -ENOMEM;
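The one-line i810fb change swaps GFP_KERNEL for GFP_ATOMIC in the cursor path, presumably because that path can be reached from a context that must not sleep; the patch itself does not say so, so treat that rationale as an assumption. A minimal sketch of the allocation with the same size and an invented helper name:

#include <linux/slab.h>
#include <linux/types.h>

/* Assumption: the caller may hold a spinlock or run in softirq context,
 * so a sleeping GFP_KERNEL allocation would be unsafe here. */
static u8 *cursor_scratch_alloc(void)
{
        return kmalloc(64 * 8, GFP_ATOMIC);     /* caller must check for NULL */
}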
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3ad8455f8577..651a9e14d9a9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
614 614
615 sb = dir->i_sb; 615 sb = dir->i_sb;
616 v9ses = v9fs_inode2v9ses(dir); 616 v9ses = v9fs_inode2v9ses(dir);
617 dentry->d_op = &v9fs_dentry_operations;
617 dirfid = v9fs_fid_lookup(dentry->d_parent); 618 dirfid = v9fs_fid_lookup(dentry->d_parent);
618 619
619 if (!dirfid) { 620 if (!dirfid) {
@@ -681,8 +682,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
681 goto FreeFcall; 682 goto FreeFcall;
682 683
683 fid->qid = fcall->params.rstat.stat.qid; 684 fid->qid = fcall->params.rstat.stat.qid;
684
685 dentry->d_op = &v9fs_dentry_operations;
686 v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); 685 v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb);
687 686
688 d_add(dentry, inode); 687 d_add(dentry, inode);
diff --git a/fs/buffer.c b/fs/buffer.c
index a9b399402007..1d3683d496f8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3051,68 +3051,6 @@ asmlinkage long sys_bdflush(int func, long data)
3051} 3051}
3052 3052
3053/* 3053/*
3054 * Migration function for pages with buffers. This function can only be used
3055 * if the underlying filesystem guarantees that no other references to "page"
3056 * exist.
3057 */
3058#ifdef CONFIG_MIGRATION
3059int buffer_migrate_page(struct page *newpage, struct page *page)
3060{
3061 struct address_space *mapping = page->mapping;
3062 struct buffer_head *bh, *head;
3063 int rc;
3064
3065 if (!mapping)
3066 return -EAGAIN;
3067
3068 if (!page_has_buffers(page))
3069 return migrate_page(newpage, page);
3070
3071 head = page_buffers(page);
3072
3073 rc = migrate_page_remove_references(newpage, page, 3);
3074 if (rc)
3075 return rc;
3076
3077 bh = head;
3078 do {
3079 get_bh(bh);
3080 lock_buffer(bh);
3081 bh = bh->b_this_page;
3082
3083 } while (bh != head);
3084
3085 ClearPagePrivate(page);
3086 set_page_private(newpage, page_private(page));
3087 set_page_private(page, 0);
3088 put_page(page);
3089 get_page(newpage);
3090
3091 bh = head;
3092 do {
3093 set_bh_page(bh, newpage, bh_offset(bh));
3094 bh = bh->b_this_page;
3095
3096 } while (bh != head);
3097
3098 SetPagePrivate(newpage);
3099
3100 migrate_page_copy(newpage, page);
3101
3102 bh = head;
3103 do {
3104 unlock_buffer(bh);
3105 put_bh(bh);
3106 bh = bh->b_this_page;
3107
3108 } while (bh != head);
3109
3110 return 0;
3111}
3112EXPORT_SYMBOL(buffer_migrate_page);
3113#endif
3114
3115/*
3116 * Buffer-head allocation 3054 * Buffer-head allocation
3117 */ 3055 */
3118static kmem_cache_t *bh_cachep; 3056static kmem_cache_t *bh_cachep;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..25fa8bba8cb5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
56 pagevec_reinit(pvec); 56 pagevec_reinit(pvec);
57} 57}
58 58
59/*
60 * huge_pages_needed tries to determine the number of new huge pages that
61 * will be required to fully populate this VMA. This will be equal to
62 * the size of the VMA in huge pages minus the number of huge pages
63 * (covered by this VMA) that are found in the page cache.
64 *
65 * Result is in bytes to be compatible with is_hugepage_mem_enough()
66 */
67static unsigned long
68huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
69{
70 int i;
71 struct pagevec pvec;
72 unsigned long start = vma->vm_start;
73 unsigned long end = vma->vm_end;
74 unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
75 pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
76 pgoff_t endpg = next + hugepages;
77
78 pagevec_init(&pvec, 0);
79 while (next < endpg) {
80 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
81 break;
82 for (i = 0; i < pagevec_count(&pvec); i++) {
83 struct page *page = pvec.pages[i];
84 if (page->index > next)
85 next = page->index;
86 if (page->index >= endpg)
87 break;
88 next++;
89 hugepages--;
90 }
91 huge_pagevec_release(&pvec);
92 }
93 return hugepages << HPAGE_SHIFT;
94}
95
96static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 59static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
97{ 60{
98 struct inode *inode = file->f_dentry->d_inode; 61 struct inode *inode = file->f_dentry->d_inode;
99 struct address_space *mapping = inode->i_mapping; 62 struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
100 unsigned long bytes;
101 loff_t len, vma_len; 63 loff_t len, vma_len;
102 int ret; 64 int ret;
103 65
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
113 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 75 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
114 return -EINVAL; 76 return -EINVAL;
115 77
116 bytes = huge_pages_needed(mapping, vma);
117 if (!is_hugepage_mem_enough(bytes))
118 return -ENOMEM;
119
120 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 78 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
121 79
122 mutex_lock(&inode->i_mutex); 80 mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
129 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) 87 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
130 goto out; 88 goto out;
131 89
90 if (vma->vm_flags & VM_MAYSHARE)
91 if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
92 goto out;
93
132 ret = 0; 94 ret = 0;
133 hugetlb_prefault_arch_hook(vma->vm_mm); 95 hugetlb_prefault_arch_hook(vma->vm_mm);
134 if (inode->i_size < len) 96 if (inode->i_size < len)
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
227 put_page(page); 189 put_page(page);
228} 190}
229 191
230static void truncate_hugepages(struct address_space *mapping, loff_t lstart) 192static void truncate_hugepages(struct inode *inode, loff_t lstart)
231{ 193{
194 struct address_space *mapping = &inode->i_data;
232 const pgoff_t start = lstart >> HPAGE_SHIFT; 195 const pgoff_t start = lstart >> HPAGE_SHIFT;
233 struct pagevec pvec; 196 struct pagevec pvec;
234 pgoff_t next; 197 pgoff_t next;
235 int i; 198 int i;
236 199
200 hugetlb_truncate_reservation(HUGETLBFS_I(inode),
201 lstart >> HPAGE_SHIFT);
202 if (!mapping->nrpages)
203 return;
237 pagevec_init(&pvec, 0); 204 pagevec_init(&pvec, 0);
238 next = start; 205 next = start;
239 while (1) { 206 while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
262 229
263static void hugetlbfs_delete_inode(struct inode *inode) 230static void hugetlbfs_delete_inode(struct inode *inode)
264{ 231{
265 if (inode->i_data.nrpages) 232 truncate_hugepages(inode, 0);
266 truncate_hugepages(&inode->i_data, 0);
267 clear_inode(inode); 233 clear_inode(inode);
268} 234}
269 235
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
296 inode->i_state |= I_FREEING; 262 inode->i_state |= I_FREEING;
297 inodes_stat.nr_inodes--; 263 inodes_stat.nr_inodes--;
298 spin_unlock(&inode_lock); 264 spin_unlock(&inode_lock);
299 if (inode->i_data.nrpages) 265 truncate_hugepages(inode, 0);
300 truncate_hugepages(&inode->i_data, 0);
301 clear_inode(inode); 266 clear_inode(inode);
302 destroy_inode(inode); 267 destroy_inode(inode);
303} 268}
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
356 if (!prio_tree_empty(&mapping->i_mmap)) 321 if (!prio_tree_empty(&mapping->i_mmap))
357 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 322 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
358 spin_unlock(&mapping->i_mmap_lock); 323 spin_unlock(&mapping->i_mmap_lock);
359 truncate_hugepages(mapping, offset); 324 truncate_hugepages(inode, offset);
360 return 0; 325 return 0;
361} 326}
362 327
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
573 hugetlbfs_inc_free_inodes(sbinfo); 538 hugetlbfs_inc_free_inodes(sbinfo);
574 return NULL; 539 return NULL;
575 } 540 }
541 p->prereserved_hpages = 0;
576 return &p->vfs_inode; 542 return &p->vfs_inode;
577} 543}
578 544
@@ -771,21 +737,6 @@ static struct file_system_type hugetlbfs_fs_type = {
771 737
772static struct vfsmount *hugetlbfs_vfsmount; 738static struct vfsmount *hugetlbfs_vfsmount;
773 739
774/*
775 * Return the next identifier for a shm file
776 */
777static unsigned long hugetlbfs_counter(void)
778{
779 static DEFINE_SPINLOCK(lock);
780 static unsigned long counter;
781 unsigned long ret;
782
783 spin_lock(&lock);
784 ret = ++counter;
785 spin_unlock(&lock);
786 return ret;
787}
788
789static int can_do_hugetlb_shm(void) 740static int can_do_hugetlb_shm(void)
790{ 741{
791 return likely(capable(CAP_IPC_LOCK) || 742 return likely(capable(CAP_IPC_LOCK) ||
@@ -801,18 +752,16 @@ struct file *hugetlb_zero_setup(size_t size)
801 struct dentry *dentry, *root; 752 struct dentry *dentry, *root;
802 struct qstr quick_string; 753 struct qstr quick_string;
803 char buf[16]; 754 char buf[16];
755 static atomic_t counter;
804 756
805 if (!can_do_hugetlb_shm()) 757 if (!can_do_hugetlb_shm())
806 return ERR_PTR(-EPERM); 758 return ERR_PTR(-EPERM);
807 759
808 if (!is_hugepage_mem_enough(size))
809 return ERR_PTR(-ENOMEM);
810
811 if (!user_shm_lock(size, current->user)) 760 if (!user_shm_lock(size, current->user))
812 return ERR_PTR(-ENOMEM); 761 return ERR_PTR(-ENOMEM);
813 762
814 root = hugetlbfs_vfsmount->mnt_root; 763 root = hugetlbfs_vfsmount->mnt_root;
815 snprintf(buf, 16, "%lu", hugetlbfs_counter()); 764 snprintf(buf, 16, "%u", atomic_inc_return(&counter));
816 quick_string.name = buf; 765 quick_string.name = buf;
817 quick_string.len = strlen(quick_string.name); 766 quick_string.len = strlen(quick_string.name);
818 quick_string.hash = 0; 767 quick_string.hash = 0;
@@ -831,6 +780,11 @@ struct file *hugetlb_zero_setup(size_t size)
831 if (!inode) 780 if (!inode)
832 goto out_file; 781 goto out_file;
833 782
783 error = -ENOMEM;
784 if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
785 size >> HPAGE_SHIFT) != 0)
786 goto out_inode;
787
834 d_instantiate(dentry, inode); 788 d_instantiate(dentry, inode);
835 inode->i_size = size; 789 inode->i_size = size;
836 inode->i_nlink = 0; 790 inode->i_nlink = 0;
@@ -841,6 +795,8 @@ struct file *hugetlb_zero_setup(size_t size)
841 file->f_mode = FMODE_WRITE | FMODE_READ; 795 file->f_mode = FMODE_WRITE | FMODE_READ;
842 return file; 796 return file;
843 797
798out_inode:
799 iput(inode);
844out_file: 800out_file:
845 put_filp(file); 801 put_filp(file);
846out_dentry: 802out_dentry:
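The hugetlbfs rework replaces the up-front is_hugepage_mem_enough() estimate with per-inode reservations: shared mappings extend the reservation at mmap time and truncation shrinks it, via the hugetlb_extend_reservation()/hugetlb_truncate_reservation() calls visible above. A minimal sketch of the mmap-side check; the wrapper and its error handling are illustrative, and it assumes the in-tree hugetlbfs definitions (HUGETLBFS_I, HPAGE_SHIFT).

/* Reserve enough huge pages for a shared mapping of 'len' bytes, failing
 * the mmap rather than overcommitting the huge page pool. */
static int demo_reserve_for_mmap(struct inode *inode,
                                 struct vm_area_struct *vma, loff_t len)
{
        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

        if (!(vma->vm_flags & VM_MAYSHARE))
                return 0;
        if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
                return -ENOMEM;
        return 0;
}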
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8dd3aafec499..09e1c57a86a0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void)
959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", 959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
960 sizeof(struct ocfs2_journal_lock), 960 sizeof(struct ocfs2_journal_lock),
961 0, 961 0,
962 SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, 962 SLAB_HWCACHE_ALIGN,
963 NULL, NULL); 963 NULL, NULL);
964 if (!ocfs2_lock_cache) 964 if (!ocfs2_lock_cache)
965 return -ENOMEM; 965 return -ENOMEM;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 3f810acd0bfa..b1ca234068f6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
87 xpages = 1UL << order; 87 xpages = 1UL << order;
88 npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; 88 npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
89 89
90 for (loop = 0; loop < npages; loop++) 90 split_page(pages, order);
91 set_page_count(pages + loop, 1);
92 91
93 /* trim off any pages we don't actually require */ 92 /* trim off any pages we don't actually require */
94 for (loop = npages; loop < xpages; loop++) 93 for (loop = npages; loop < xpages; loop++)
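ramfs-nommu used to set the reference count of every page in a high-order allocation by hand; the new split_page() helper does that in one call, turning the order-N block into 2^N independently refcounted order-0 pages so the unneeded tail can be freed page by page. A minimal sketch with an invented wrapper; the tail trimming itself is left out.

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *alloc_split(unsigned int order)
{
        struct page *pages = alloc_pages(GFP_KERNEL, order);

        if (!pages)
                return NULL;
        split_page(pages, order);       /* every sub-page now has count 1 */
        return pages;                   /* unwanted tail pages can be freed */
}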
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bfb4f2917bb6..8cdfa4151659 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -29,6 +29,7 @@
29#include <linux/blkdev.h> 29#include <linux/blkdev.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/migrate.h>
32#include "xfs_linux.h" 33#include "xfs_linux.h"
33 34
34STATIC kmem_zone_t *xfs_buf_zone; 35STATIC kmem_zone_t *xfs_buf_zone;
diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h
index 55059abf9c95..20f523954218 100644
--- a/include/asm-i386/acpi.h
+++ b/include/asm-i386/acpi.h
@@ -103,6 +103,12 @@ __acpi_release_global_lock (unsigned int *lock)
103 :"=r"(n_hi), "=r"(n_lo) \ 103 :"=r"(n_hi), "=r"(n_lo) \
104 :"0"(n_hi), "1"(n_lo)) 104 :"0"(n_hi), "1"(n_lo))
105 105
106#ifdef CONFIG_X86_IO_APIC
107extern void check_acpi_pci(void);
108#else
109static inline void check_acpi_pci(void) { }
110#endif
111
106#ifdef CONFIG_ACPI 112#ifdef CONFIG_ACPI
107extern int acpi_lapic; 113extern int acpi_lapic;
108extern int acpi_ioapic; 114extern int acpi_ioapic;
@@ -128,8 +134,6 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
128extern int skip_ioapic_setup; 134extern int skip_ioapic_setup;
129extern int acpi_skip_timer_override; 135extern int acpi_skip_timer_override;
130 136
131extern void check_acpi_pci(void);
132
133static inline void disable_ioapic_setup(void) 137static inline void disable_ioapic_setup(void)
134{ 138{
135 skip_ioapic_setup = 1; 139 skip_ioapic_setup = 1;
@@ -142,8 +146,6 @@ static inline int ioapic_setup_disabled(void)
142 146
143#else 147#else
144static inline void disable_ioapic_setup(void) { } 148static inline void disable_ioapic_setup(void) { }
145static inline void check_acpi_pci(void) { }
146
147#endif 149#endif
148 150
149static inline void acpi_noirq_set(void) { acpi_noirq = 1; } 151static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
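The acpi.h change moves check_acpi_pci() under CONFIG_X86_IO_APIC and supplies an empty inline stub for the other case, the usual way a header lets callers avoid #ifdefs of their own. A minimal sketch of that pattern with an invented symbol:

#ifdef CONFIG_DEMO_FEATURE
extern void demo_hook(void);            /* real implementation elsewhere */
#else
static inline void demo_hook(void) { }  /* compiles away when disabled */
#endif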
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 088a945bf26b..ee056c41a9fb 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -219,13 +219,12 @@ extern unsigned long pg0[];
219 * The following only work if pte_present() is true. 219 * The following only work if pte_present() is true.
220 * Undefined behaviour if not.. 220 * Undefined behaviour if not..
221 */ 221 */
222#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
223static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } 222static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
224static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } 223static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
225static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } 224static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
226static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } 225static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
227static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } 226static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
228static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } 227static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
229 228
230/* 229/*
231 * The following only works if pte_present() is not true. 230 * The following only works if pte_present() is not true.
@@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return
242static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } 241static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
243static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } 242static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
244static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } 243static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
245static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } 244static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
246 245
247#ifdef CONFIG_X86_PAE 246#ifdef CONFIG_X86_PAE
248# include <asm/pgtable-3level.h> 247# include <asm/pgtable-3level.h>
diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index a7122d850177..d069b6acddce 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -5,113 +5,10 @@
5 * 5 *
6 * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com> 6 * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
7 * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com> 7 * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
8 * Copyright (C) 2005,2006 Hongjiu Lu <hongjiu.lu@intel.com>
8 * 9 *
9 */ 10 */
10#include <asm/types.h> 11#include <ia64intrin.h>
11
12void __lfetch(int lfhint, void *y);
13void __lfetch_excl(int lfhint, void *y);
14void __lfetch_fault(int lfhint, void *y);
15void __lfetch_fault_excl(int lfhint, void *y);
16
17/* In the following, whichFloatReg should be an integer from 0-127 */
18void __ldfs(const int whichFloatReg, void *src);
19void __ldfd(const int whichFloatReg, void *src);
20void __ldfe(const int whichFloatReg, void *src);
21void __ldf8(const int whichFloatReg, void *src);
22void __ldf_fill(const int whichFloatReg, void *src);
23void __stfs(void *dst, const int whichFloatReg);
24void __stfd(void *dst, const int whichFloatReg);
25void __stfe(void *dst, const int whichFloatReg);
26void __stf8(void *dst, const int whichFloatReg);
27void __stf_spill(void *dst, const int whichFloatReg);
28
29void __st1_rel(void *dst, const __s8 value);
30void __st2_rel(void *dst, const __s16 value);
31void __st4_rel(void *dst, const __s32 value);
32void __st8_rel(void *dst, const __s64 value);
33__u8 __ld1_acq(void *src);
34__u16 __ld2_acq(void *src);
35__u32 __ld4_acq(void *src);
36__u64 __ld8_acq(void *src);
37
38__u64 __fetchadd4_acq(__u32 *addend, const int increment);
39__u64 __fetchadd4_rel(__u32 *addend, const int increment);
40__u64 __fetchadd8_acq(__u64 *addend, const int increment);
41__u64 __fetchadd8_rel(__u64 *addend, const int increment);
42
43__u64 __getf_exp(double d);
44
45/* OS Related Itanium(R) Intrinsics */
46
47/* The names to use for whichReg and whichIndReg below come from
48 the include file asm/ia64regs.h */
49
50__u64 __getIndReg(const int whichIndReg, __s64 index);
51__u64 __getReg(const int whichReg);
52
53void __setIndReg(const int whichIndReg, __s64 index, __u64 value);
54void __setReg(const int whichReg, __u64 value);
55
56void __mf(void);
57void __mfa(void);
58void __synci(void);
59void __itcd(__s64 pa);
60void __itci(__s64 pa);
61void __itrd(__s64 whichTransReg, __s64 pa);
62void __itri(__s64 whichTransReg, __s64 pa);
63void __ptce(__s64 va);
64void __ptcl(__s64 va, __s64 pagesz);
65void __ptcg(__s64 va, __s64 pagesz);
66void __ptcga(__s64 va, __s64 pagesz);
67void __ptri(__s64 va, __s64 pagesz);
68void __ptrd(__s64 va, __s64 pagesz);
69void __invala (void);
70void __invala_gr(const int whichGeneralReg /* 0-127 */ );
71void __invala_fr(const int whichFloatReg /* 0-127 */ );
72void __nop(const int);
73void __fc(__u64 *addr);
74void __sum(int mask);
75void __rum(int mask);
76void __ssm(int mask);
77void __rsm(int mask);
78__u64 __thash(__s64);
79__u64 __ttag(__s64);
80__s64 __tpa(__s64);
81
82/* Intrinsics for implementing get/put_user macros */
83void __st_user(const char *tableName, __u64 addr, char size, char relocType, __u64 val);
84void __ld_user(const char *tableName, __u64 addr, char size, char relocType);
85
86/* This intrinsic does not generate code, it creates a barrier across which
87 * the compiler will not schedule data access instructions.
88 */
89void __memory_barrier(void);
90
91void __isrlz(void);
92void __dsrlz(void);
93
94__u64 _m64_mux1(__u64 a, const int n);
95__u64 __thash(__u64);
96
97/* Lock and Atomic Operation Related Intrinsics */
98__u64 _InterlockedExchange8(volatile __u8 *trgt, __u8 value);
99__u64 _InterlockedExchange16(volatile __u16 *trgt, __u16 value);
100__s64 _InterlockedExchange(volatile __u32 *trgt, __u32 value);
101__s64 _InterlockedExchange64(volatile __u64 *trgt, __u64 value);
102
103__u64 _InterlockedCompareExchange8_rel(volatile __u8 *dest, __u64 xchg, __u64 comp);
104__u64 _InterlockedCompareExchange8_acq(volatile __u8 *dest, __u64 xchg, __u64 comp);
105__u64 _InterlockedCompareExchange16_rel(volatile __u16 *dest, __u64 xchg, __u64 comp);
106__u64 _InterlockedCompareExchange16_acq(volatile __u16 *dest, __u64 xchg, __u64 comp);
107__u64 _InterlockedCompareExchange_rel(volatile __u32 *dest, __u64 xchg, __u64 comp);
108__u64 _InterlockedCompareExchange_acq(volatile __u32 *dest, __u64 xchg, __u64 comp);
109__u64 _InterlockedCompareExchange64_rel(volatile __u64 *dest, __u64 xchg, __u64 comp);
110__u64 _InterlockedCompareExchange64_acq(volatile __u64 *dest, __u64 xchg, __u64 comp);
111
112__s64 _m64_dep_mi(const int v, __s64 s, const int p, const int len);
113__s64 _m64_shrp(__s64 a, __s64 b, const int count);
114__s64 _m64_popcnt(__s64 a);
115 12
116#define ia64_barrier() __memory_barrier() 13#define ia64_barrier() __memory_barrier()
117 14
@@ -122,15 +19,16 @@ __s64 _m64_popcnt(__s64 a);
122#define ia64_getreg __getReg 19#define ia64_getreg __getReg
123#define ia64_setreg __setReg 20#define ia64_setreg __setReg
124 21
125#define ia64_hint(x) 22#define ia64_hint __hint
23#define ia64_hint_pause __hint_pause
126 24
127#define ia64_mux1_brcst 0 25#define ia64_mux1_brcst _m64_mux1_brcst
128#define ia64_mux1_mix 8 26#define ia64_mux1_mix _m64_mux1_mix
129#define ia64_mux1_shuf 9 27#define ia64_mux1_shuf _m64_mux1_shuf
130#define ia64_mux1_alt 10 28#define ia64_mux1_alt _m64_mux1_alt
131#define ia64_mux1_rev 11 29#define ia64_mux1_rev _m64_mux1_rev
132 30
133#define ia64_mux1 _m64_mux1 31#define ia64_mux1(x,v) _m_to_int64(_m64_mux1(_m_from_int64(x), (v)))
134#define ia64_popcnt _m64_popcnt 32#define ia64_popcnt _m64_popcnt
135#define ia64_getf_exp __getf_exp 33#define ia64_getf_exp __getf_exp
136#define ia64_shrp _m64_shrp 34#define ia64_shrp _m64_shrp
@@ -158,7 +56,7 @@ __s64 _m64_popcnt(__s64 a);
158#define ia64_stf8 __stf8 56#define ia64_stf8 __stf8
159#define ia64_stf_spill __stf_spill 57#define ia64_stf_spill __stf_spill
160 58
161#define ia64_mf __mf 59#define ia64_mf __mf
162#define ia64_mfa __mfa 60#define ia64_mfa __mfa
163 61
164#define ia64_fetchadd4_acq __fetchadd4_acq 62#define ia64_fetchadd4_acq __fetchadd4_acq
@@ -234,10 +132,10 @@ __s64 _m64_popcnt(__s64 a);
234 132
235/* Values for lfhint in __lfetch and __lfetch_fault */ 133/* Values for lfhint in __lfetch and __lfetch_fault */
236 134
237#define ia64_lfhint_none 0 135#define ia64_lfhint_none __lfhint_none
238#define ia64_lfhint_nt1 1 136#define ia64_lfhint_nt1 __lfhint_nt1
239#define ia64_lfhint_nt2 2 137#define ia64_lfhint_nt2 __lfhint_nt2
240#define ia64_lfhint_nta 3 138#define ia64_lfhint_nta __lfhint_nta
241 139
242#define ia64_lfetch __lfetch 140#define ia64_lfetch __lfetch
243#define ia64_lfetch_excl __lfetch_excl 141#define ia64_lfetch_excl __lfetch_excl
@@ -254,4 +152,6 @@ do { \
254 } \ 152 } \
255} while (0) 153} while (0)
256 154
155#define __builtin_trap() __break(0);
156
257#endif /* _ASM_IA64_INTEL_INTRIN_H */ 157#endif /* _ASM_IA64_INTEL_INTRIN_H */
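The intrinsic table above maps the kernel's ia64 atomics onto the compiler's _InterlockedCompareExchange*_acq/_rel family, i.e. compare-and-swap with acquire ordering on the lock side and release ordering on the unlock side. As a rough illustration of that acquire/release pairing, here is a minimal user-space sketch using the GCC/Clang __atomic builtins rather than the icc intrinsics themselves; the lock_word, try_lock and unlock names are invented for the demo.

/* Minimal user-space sketch of acquire/release compare-exchange,
 * analogous to _InterlockedCompareExchange_acq/_rel above.
 * Uses GCC/Clang __atomic builtins, not the icc intrinsics. */
#include <stdio.h>

static unsigned int lock_word;          /* 0 = unlocked, 1 = locked */

static int try_lock(unsigned int *w)
{
        unsigned int expected = 0;
        /* acquire semantics when taking the lock */
        return __atomic_compare_exchange_n(w, &expected, 1, 0,
                                           __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

static void unlock(unsigned int *w)
{
        /* release semantics when dropping the lock */
        __atomic_store_n(w, 0, __ATOMIC_RELEASE);
}

int main(void)
{
        if (try_lock(&lock_word)) {
                printf("lock acquired\n");
                unlock(&lock_word);
        }
        return 0;
}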
diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
index ca5ea994d688..c3e4ed8a3e17 100644
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -20,6 +20,7 @@ struct scatterlist;
20struct page; 20struct page;
21struct mm_struct; 21struct mm_struct;
22struct pci_bus; 22struct pci_bus;
23struct task_struct;
23 24
24typedef void ia64_mv_setup_t (char **); 25typedef void ia64_mv_setup_t (char **);
25typedef void ia64_mv_cpu_init_t (void); 26typedef void ia64_mv_cpu_init_t (void);
@@ -34,6 +35,7 @@ typedef int ia64_mv_pci_legacy_read_t (struct pci_bus *, u16 port, u32 *val,
34 u8 size); 35 u8 size);
35typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val, 36typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val,
36 u8 size); 37 u8 size);
38typedef void ia64_mv_migrate_t(struct task_struct * task);
37 39
38/* DMA-mapping interface: */ 40/* DMA-mapping interface: */
39typedef void ia64_mv_dma_init (void); 41typedef void ia64_mv_dma_init (void);
@@ -85,6 +87,11 @@ machvec_noop_mm (struct mm_struct *mm)
85{ 87{
86} 88}
87 89
90static inline void
91machvec_noop_task (struct task_struct *task)
92{
93}
94
88extern void machvec_setup (char **); 95extern void machvec_setup (char **);
89extern void machvec_timer_interrupt (int, void *, struct pt_regs *); 96extern void machvec_timer_interrupt (int, void *, struct pt_regs *);
90extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int); 97extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int);
@@ -146,6 +153,7 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
146# define platform_readw_relaxed ia64_mv.readw_relaxed 153# define platform_readw_relaxed ia64_mv.readw_relaxed
147# define platform_readl_relaxed ia64_mv.readl_relaxed 154# define platform_readl_relaxed ia64_mv.readl_relaxed
148# define platform_readq_relaxed ia64_mv.readq_relaxed 155# define platform_readq_relaxed ia64_mv.readq_relaxed
156# define platform_migrate ia64_mv.migrate
149# endif 157# endif
150 158
151/* __attribute__((__aligned__(16))) is required to make size of the 159/* __attribute__((__aligned__(16))) is required to make size of the
@@ -194,6 +202,7 @@ struct ia64_machine_vector {
194 ia64_mv_readw_relaxed_t *readw_relaxed; 202 ia64_mv_readw_relaxed_t *readw_relaxed;
195 ia64_mv_readl_relaxed_t *readl_relaxed; 203 ia64_mv_readl_relaxed_t *readl_relaxed;
196 ia64_mv_readq_relaxed_t *readq_relaxed; 204 ia64_mv_readq_relaxed_t *readq_relaxed;
205 ia64_mv_migrate_t *migrate;
197} __attribute__((__aligned__(16))); /* align attrib? see above comment */ 206} __attribute__((__aligned__(16))); /* align attrib? see above comment */
198 207
199#define MACHVEC_INIT(name) \ 208#define MACHVEC_INIT(name) \
@@ -238,6 +247,7 @@ struct ia64_machine_vector {
238 platform_readw_relaxed, \ 247 platform_readw_relaxed, \
239 platform_readl_relaxed, \ 248 platform_readl_relaxed, \
240 platform_readq_relaxed, \ 249 platform_readq_relaxed, \
250 platform_migrate, \
241} 251}
242 252
243extern struct ia64_machine_vector ia64_mv; 253extern struct ia64_machine_vector ia64_mv;
@@ -386,5 +396,8 @@ extern ia64_mv_dma_supported swiotlb_dma_supported;
386#ifndef platform_readq_relaxed 396#ifndef platform_readq_relaxed
387# define platform_readq_relaxed __ia64_readq_relaxed 397# define platform_readq_relaxed __ia64_readq_relaxed
388#endif 398#endif
399#ifndef platform_migrate
400# define platform_migrate machvec_noop_task
401#endif
389 402
390#endif /* _ASM_IA64_MACHVEC_H */ 403#endif /* _ASM_IA64_MACHVEC_H */
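The machvec.h hunks above add an optional per-platform migrate hook: a new function-pointer slot in the ia64 machine vector that falls back to machvec_noop_task when a platform does not define platform_migrate. A minimal, self-contained sketch of the same default-to-noop hook-table pattern, with invented names rather than the kernel structures:

/* Sketch of a machine-vector style hook table with a no-op default.
 * Names (struct machine_ops, noop_task, etc.) are illustrative only. */
#include <stdio.h>

struct task { int id; };

typedef void migrate_fn(struct task *t);

static void noop_task(struct task *t) { (void)t; }    /* default: do nothing */
static void sn_style_migrate(struct task *t)          /* platform override */
{
        printf("flushing per-task state for task %d\n", t->id);
}

struct machine_ops {
        migrate_fn *migrate;
};

/* Platforms that care install their own hook; others keep the no-op. */
static struct machine_ops generic_ops = { .migrate = noop_task };
static struct machine_ops sn2_ops     = { .migrate = sn_style_migrate };

int main(void)
{
        struct task t = { .id = 7 };
        generic_ops.migrate(&t);   /* no-op */
        sn2_ops.migrate(&t);       /* platform-specific work */
        return 0;
}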
diff --git a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
index 03d00faf03b5..da1d43755afe 100644
--- a/include/asm-ia64/machvec_sn2.h
+++ b/include/asm-ia64/machvec_sn2.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2002-2003, 2006 Silicon Graphics, Inc. All Rights Reserved. 2 * Copyright (c) 2002-2003,2006 Silicon Graphics, Inc. All Rights Reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License 5 * under the terms of version 2 of the GNU General Public License
@@ -66,6 +66,7 @@ extern ia64_mv_dma_sync_single_for_device sn_dma_sync_single_for_device;
66extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device; 66extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device;
67extern ia64_mv_dma_mapping_error sn_dma_mapping_error; 67extern ia64_mv_dma_mapping_error sn_dma_mapping_error;
68extern ia64_mv_dma_supported sn_dma_supported; 68extern ia64_mv_dma_supported sn_dma_supported;
69extern ia64_mv_migrate_t sn_migrate;
69 70
70/* 71/*
71 * This stuff has dual use! 72 * This stuff has dual use!
@@ -115,6 +116,7 @@ extern ia64_mv_dma_supported sn_dma_supported;
115#define platform_dma_sync_sg_for_device sn_dma_sync_sg_for_device 116#define platform_dma_sync_sg_for_device sn_dma_sync_sg_for_device
116#define platform_dma_mapping_error sn_dma_mapping_error 117#define platform_dma_mapping_error sn_dma_mapping_error
117#define platform_dma_supported sn_dma_supported 118#define platform_dma_supported sn_dma_supported
119#define platform_migrate sn_migrate
118 120
119#include <asm/sn/io.h> 121#include <asm/sn/io.h>
120 122
diff --git a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h
index c7d9c9ed38ba..bfbbb8da79c7 100644
--- a/include/asm-ia64/mca.h
+++ b/include/asm-ia64/mca.h
@@ -131,6 +131,8 @@ struct ia64_mca_cpu {
131/* Array of physical addresses of each CPU's MCA area. */ 131/* Array of physical addresses of each CPU's MCA area. */
132extern unsigned long __per_cpu_mca[NR_CPUS]; 132extern unsigned long __per_cpu_mca[NR_CPUS];
133 133
134extern int cpe_vector;
135extern int ia64_cpe_irq;
134extern void ia64_mca_init(void); 136extern void ia64_mca_init(void);
135extern void ia64_mca_cpu_init(void *); 137extern void ia64_mca_cpu_init(void *);
136extern void ia64_os_mca_dispatch(void); 138extern void ia64_os_mca_dispatch(void);
diff --git a/include/asm-ia64/mutex.h b/include/asm-ia64/mutex.h
index 458c1f7fbc18..5a3224f6af38 100644
--- a/include/asm-ia64/mutex.h
+++ b/include/asm-ia64/mutex.h
@@ -1,9 +1,92 @@
1/* 1/*
2 * Pull in the generic implementation for the mutex fastpath. 2 * ia64 implementation of the mutex fastpath.
3 * 3 *
4 * TODO: implement optimized primitives instead, or leave the generic 4 * Copyright (C) 2006 Ken Chen <kenneth.w.chen@intel.com>
5 * implementation in place, or pick the atomic_xchg() based generic 5 *
6 * implementation. (see asm-generic/mutex-xchg.h for details) 6 */
7
8#ifndef _ASM_MUTEX_H
9#define _ASM_MUTEX_H
10
11/**
12 * __mutex_fastpath_lock - try to take the lock by moving the count
13 * from 1 to a 0 value
14 * @count: pointer of type atomic_t
15 * @fail_fn: function to call if the original value was not 1
16 *
17 * Change the count from 1 to a value lower than 1, and call <fail_fn> if
18 * it wasn't 1 originally. This function MUST leave the value lower than
19 * 1 even when the "1" assertion wasn't true.
20 */
21static inline void
22__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
23{
24 if (unlikely(ia64_fetchadd4_acq(count, -1) != 1))
25 fail_fn(count);
26}
27
28/**
29 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
30 * from 1 to a 0 value
31 * @count: pointer of type atomic_t
32 * @fail_fn: function to call if the original value was not 1
33 *
34 * Change the count from 1 to a value lower than 1, and call <fail_fn> if
35 * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
36 * or anything the slow path function returns.
37 */
38static inline int
39__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
40{
41 if (unlikely(ia64_fetchadd4_acq(count, -1) != 1))
42 return fail_fn(count);
43 return 0;
44}
45
46/**
47 * __mutex_fastpath_unlock - try to promote the count from 0 to 1
48 * @count: pointer of type atomic_t
49 * @fail_fn: function to call if the original value was not 0
50 *
51 * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
52 * In the failure case, this function is allowed to either set the value to
53 * 1, or to set it to a value lower than 1.
54 *
55 * If the implementation sets it to a value of lower than 1, then the
56 * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
57 * to return 0 otherwise.
58 */
59static inline void
60__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
61{
62 int ret = ia64_fetchadd4_rel(count, 1);
63 if (unlikely(ret < 0))
64 fail_fn(count);
65}
66
67#define __mutex_slowpath_needs_to_unlock() 1
68
69/**
70 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
71 *
72 * @count: pointer of type atomic_t
73 * @fail_fn: fallback function
74 *
75 * Change the count from 1 to a value lower than 1, and return 0 (failure)
76 * if it wasn't 1 originally, or return 1 (success) otherwise. This function
77 * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
78 * Additionally, if the value was < 0 originally, this function must not leave
79 * it to 0 on failure.
80 *
81 * If the architecture has no effective trylock variant, it should call the
82 * <fail_fn> spinlock-based trylock variant unconditionally.
7 */ 83 */
84static inline int
85__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
86{
 87	if (likely(cmpxchg_acq(count, 1, 0) == 1))
88 return 1;
89 return 0;
90}
8 91
9#include <asm-generic/mutex-dec.h> 92#endif
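The new ia64 mutex fastpath leans on the two properties spelled out in the comments above: a fetchadd with acquire semantics takes the count from 1 to 0 on the uncontended path, and a cmpxchg with acquire semantics implements trylock. The following user-space model reproduces that counting scheme with GCC __atomic builtins; it is a sketch of the logic, not the kernel code, and the slowpath functions are stubs.

/* Count semantics: 1 = unlocked, 0 = locked, <0 = locked with waiters.
 * User-space model of the fastpath above; the slowpath is stubbed out. */
#include <stdio.h>

static int count = 1;

static void slowpath_lock(int *c)   { (void)c; /* would block in the kernel */ }
static void slowpath_unlock(int *c) { (void)c; /* would wake a waiter */ }

static void mutex_lock_model(int *c)
{
        /* fetchadd4.acq analogue: atomically add -1, look at the old value */
        if (__atomic_fetch_add(c, -1, __ATOMIC_ACQUIRE) != 1)
                slowpath_lock(c);
}

static void mutex_unlock_model(int *c)
{
        /* fetchadd4.rel analogue: old value < 0 means someone is waiting */
        if (__atomic_fetch_add(c, 1, __ATOMIC_RELEASE) < 0)
                slowpath_unlock(c);
}

static int mutex_trylock_model(int *c)
{
        int expected = 1;
        /* cmpxchg.acq analogue: succeed only if the count was exactly 1 */
        return __atomic_compare_exchange_n(c, &expected, 0, 0,
                                           __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

int main(void)
{
        mutex_lock_model(&count);
        printf("trylock while held: %d\n", mutex_trylock_model(&count)); /* 0 */
        mutex_unlock_model(&count);
        printf("trylock when free: %d\n", mutex_trylock_model(&count));  /* 1 */
        return 0;
}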
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 5e6362a786b7..3ab27333dae4 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -57,6 +57,8 @@
57 57
58# define HAVE_ARCH_HUGETLB_UNMAPPED_AREA 58# define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
59# define ARCH_HAS_HUGEPAGE_ONLY_RANGE 59# define ARCH_HAS_HUGEPAGE_ONLY_RANGE
60# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
61# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
60#endif /* CONFIG_HUGETLB_PAGE */ 62#endif /* CONFIG_HUGETLB_PAGE */
61 63
62#ifdef __ASSEMBLY__ 64#ifdef __ASSEMBLY__
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index e2560c58384b..c0f8144f2349 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr)
314#define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) 314#define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A))
315#define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) 315#define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D))
316#define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) 316#define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D))
317#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) 317#define pte_mkhuge(pte) (__pte(pte_val(pte)))
318 318
319/* 319/*
320 * Macro to a page protection value as "uncacheable". Note that "protection" is really a 320 * Macro to a page protection value as "uncacheable". Note that "protection" is really a
@@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr;
505#define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3)) 505#define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3))
506#define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT) 506#define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
507#define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1)) 507#define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1))
508struct mmu_gather;
509void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
510 unsigned long end, unsigned long floor, unsigned long ceiling);
511#endif 508#endif
512 509
513/* 510/*
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 23c8e1be1911..128fefd8056f 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -50,7 +50,8 @@
50#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */ 50#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */
51#define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */ 51#define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */
52#define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. */ 52#define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. */
53 /* bit 5 is currently unused */ 53#define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration
54 sync at ctx sw */
54#define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6) /* don't log any fpswa faults */ 55#define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6) /* don't log any fpswa faults */
55#define IA64_THREAD_FPEMU_SIGFPE (__IA64_UL(1) << 7) /* send a SIGFPE for fpswa faults */ 56#define IA64_THREAD_FPEMU_SIGFPE (__IA64_UL(1) << 7) /* send a SIGFPE for fpswa faults */
56 57
diff --git a/include/asm-ia64/signal.h b/include/asm-ia64/signal.h
index 608168d713d3..5e328ed5d01d 100644
--- a/include/asm-ia64/signal.h
+++ b/include/asm-ia64/signal.h
@@ -158,8 +158,6 @@ struct k_sigaction {
158 158
159#define ptrace_signal_deliver(regs, cookie) do { } while (0) 159#define ptrace_signal_deliver(regs, cookie) do { } while (0)
160 160
161void set_sigdelayed(pid_t pid, int signo, int code, void __user *addr);
162
163#endif /* __KERNEL__ */ 161#endif /* __KERNEL__ */
164 162
165# endif /* !__ASSEMBLY__ */ 163# endif /* !__ASSEMBLY__ */
diff --git a/include/asm-ia64/sn/addrs.h b/include/asm-ia64/sn/addrs.h
index 2c32e4b77b54..1d9efe541662 100644
--- a/include/asm-ia64/sn/addrs.h
+++ b/include/asm-ia64/sn/addrs.h
@@ -283,5 +283,13 @@
283#define REMOTE_HUB_L(n, a) HUB_L(REMOTE_HUB_ADDR((n), (a))) 283#define REMOTE_HUB_L(n, a) HUB_L(REMOTE_HUB_ADDR((n), (a)))
284#define REMOTE_HUB_S(n, a, d) HUB_S(REMOTE_HUB_ADDR((n), (a)), (d)) 284#define REMOTE_HUB_S(n, a, d) HUB_S(REMOTE_HUB_ADDR((n), (a)), (d))
285 285
286/*
287 * Coretalk address breakdown
288 */
289#define CTALK_NASID_SHFT 40
290#define CTALK_NASID_MASK (0x3FFFULL << CTALK_NASID_SHFT)
291#define CTALK_CID_SHFT 38
292#define CTALK_CID_MASK (0x3ULL << CTALK_CID_SHFT)
293#define CTALK_NODE_OFFSET 0x3FFFFFFFFF
286 294
287#endif /* _ASM_IA64_SN_ADDRS_H */ 295#endif /* _ASM_IA64_SN_ADDRS_H */
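The new CTALK_* constants carve a 64-bit Coretalk address into a node id (nasid) in bits 40..53, a chiplet id in bits 38..39 and a 38-bit node offset. A small decoder built from exactly those masks and shifts, with a sample address made up purely for illustration:

/* Decode a Coretalk address using the CTALK_* layout added above.
 * The sample address below is invented for the demo. */
#include <stdio.h>
#include <stdint.h>

#define CTALK_NASID_SHFT  40
#define CTALK_NASID_MASK  (0x3FFFULL << CTALK_NASID_SHFT)
#define CTALK_CID_SHFT    38
#define CTALK_CID_MASK    (0x3ULL << CTALK_CID_SHFT)
#define CTALK_NODE_OFFSET 0x3FFFFFFFFFULL

int main(void)
{
        uint64_t addr = ((uint64_t)0x012A << CTALK_NASID_SHFT) |
                        ((uint64_t)0x2    << CTALK_CID_SHFT)   |
                        0x0000123456ULL;

        printf("nasid  = 0x%llx\n",
               (unsigned long long)((addr & CTALK_NASID_MASK) >> CTALK_NASID_SHFT));
        printf("cid    = 0x%llx\n",
               (unsigned long long)((addr & CTALK_CID_MASK) >> CTALK_CID_SHFT));
        printf("offset = 0x%llx\n",
               (unsigned long long)(addr & CTALK_NODE_OFFSET));
        return 0;
}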
diff --git a/include/asm-ia64/sn/rw_mmr.h b/include/asm-ia64/sn/rw_mmr.h
index f40fd1a5510d..2d78f4c5a45e 100644
--- a/include/asm-ia64/sn/rw_mmr.h
+++ b/include/asm-ia64/sn/rw_mmr.h
@@ -3,15 +3,14 @@
3 * License. See the file "COPYING" in the main directory of this archive 3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details. 4 * for more details.
5 * 5 *
6 * Copyright (C) 2002-2004 Silicon Graphics, Inc. All Rights Reserved. 6 * Copyright (C) 2002-2006 Silicon Graphics, Inc. All Rights Reserved.
7 */ 7 */
8#ifndef _ASM_IA64_SN_RW_MMR_H 8#ifndef _ASM_IA64_SN_RW_MMR_H
9#define _ASM_IA64_SN_RW_MMR_H 9#define _ASM_IA64_SN_RW_MMR_H
10 10
11 11
12/* 12/*
13 * This file contains macros used to access MMR registers via 13 * This file declares routines that access MMRs via uncached physical addresses.
14 * uncached physical addresses.
15 * pio_phys_read_mmr - read an MMR 14 * pio_phys_read_mmr - read an MMR
16 * pio_phys_write_mmr - write an MMR 15 * pio_phys_write_mmr - write an MMR
17 * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 16 * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0
@@ -22,53 +21,8 @@
22 */ 21 */
23 22
24 23
25extern inline long 24extern long pio_phys_read_mmr(volatile long *mmr);
26pio_phys_read_mmr(volatile long *mmr) 25extern void pio_phys_write_mmr(volatile long *mmr, long val);
27{ 26extern void pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2);
28 long val;
29 asm volatile
30 ("mov r2=psr;;"
31 "rsm psr.i | psr.dt;;"
32 "srlz.i;;"
33 "ld8.acq %0=[%1];;"
34 "mov psr.l=r2;;"
35 "srlz.i;;"
36 : "=r"(val)
37 : "r"(mmr)
38 : "r2");
39 return val;
40}
41
42
43
44extern inline void
45pio_phys_write_mmr(volatile long *mmr, long val)
46{
47 asm volatile
48 ("mov r2=psr;;"
49 "rsm psr.i | psr.dt;;"
50 "srlz.i;;"
51 "st8.rel [%0]=%1;;"
52 "mov psr.l=r2;;"
53 "srlz.i;;"
54 :: "r"(mmr), "r"(val)
55 : "r2", "memory");
56}
57
58extern inline void
59pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2)
60{
61 asm volatile
62 ("mov r2=psr;;"
63 "rsm psr.i | psr.dt | psr.ic;;"
64 "cmp.ne p9,p0=%2,r0;"
65 "srlz.i;;"
66 "st8.rel [%0]=%1;"
67 "(p9) st8.rel [%2]=%3;;"
68 "mov psr.l=r2;;"
69 "srlz.i;;"
70 :: "r"(mmr1), "r"(val1), "r"(mmr2), "r"(val2)
71 : "p9", "r2", "memory");
72}
73 27
74#endif /* _ASM_IA64_SN_RW_MMR_H */ 28#endif /* _ASM_IA64_SN_RW_MMR_H */
diff --git a/include/asm-ia64/sn/tioce.h b/include/asm-ia64/sn/tioce.h
index d4c990712eac..893468e1b41b 100644
--- a/include/asm-ia64/sn/tioce.h
+++ b/include/asm-ia64/sn/tioce.h
@@ -11,7 +11,7 @@
11 11
12/* CE ASIC part & mfgr information */ 12/* CE ASIC part & mfgr information */
13#define TIOCE_PART_NUM 0xCE00 13#define TIOCE_PART_NUM 0xCE00
14#define TIOCE_MFGR_NUM 0x36 14#define TIOCE_SRC_ID 0x01
15#define TIOCE_REV_A 0x1 15#define TIOCE_REV_A 0x1
16 16
17/* CE Virtual PPB Vendor/Device IDs */ 17/* CE Virtual PPB Vendor/Device IDs */
@@ -20,7 +20,7 @@
20 20
21/* CE Host Bridge Vendor/Device IDs */ 21/* CE Host Bridge Vendor/Device IDs */
22#define CE_HOST_BRIDGE_VENDOR_ID 0x10a9 22#define CE_HOST_BRIDGE_VENDOR_ID 0x10a9
23#define CE_HOST_BRIDGE_DEVICE_ID 0x4003 23#define CE_HOST_BRIDGE_DEVICE_ID 0x4001
24 24
25 25
26#define TIOCE_NUM_M40_ATES 4096 26#define TIOCE_NUM_M40_ATES 4096
@@ -463,6 +463,25 @@ typedef volatile struct tioce {
463 u64 ce_end_of_struct; /* 0x044400 */ 463 u64 ce_end_of_struct; /* 0x044400 */
464} tioce_t; 464} tioce_t;
465 465
466/* ce_lsiX_gb_cfg1 register bit masks & shifts */
467#define CE_LSI_GB_CFG1_RXL0S_THS_SHFT 0
468#define CE_LSI_GB_CFG1_RXL0S_THS_MASK (0xffULL << 0)
469#define CE_LSI_GB_CFG1_RXL0S_SMP_SHFT 8
 470#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK (0xfULL << 8)
471#define CE_LSI_GB_CFG1_RXL0S_ADJ_SHFT 12
472#define CE_LSI_GB_CFG1_RXL0S_ADJ_MASK (0x7ULL << 12)
473#define CE_LSI_GB_CFG1_RXL0S_FLT_SHFT 15
474#define CE_LSI_GB_CFG1_RXL0S_FLT_MASK (0x1ULL << 15)
475#define CE_LSI_GB_CFG1_LPBK_SEL_SHFT 16
476#define CE_LSI_GB_CFG1_LPBK_SEL_MASK (0x3ULL << 16)
477#define CE_LSI_GB_CFG1_LPBK_EN_SHFT 18
478#define CE_LSI_GB_CFG1_LPBK_EN_MASK (0x1ULL << 18)
479#define CE_LSI_GB_CFG1_RVRS_LB_SHFT 19
480#define CE_LSI_GB_CFG1_RVRS_LB_MASK (0x1ULL << 19)
481#define CE_LSI_GB_CFG1_RVRS_CLK_SHFT 20
482#define CE_LSI_GB_CFG1_RVRS_CLK_MASK (0x3ULL << 20)
483#define CE_LSI_GB_CFG1_SLF_TS_SHFT 24
484#define CE_LSI_GB_CFG1_SLF_TS_MASK (0xfULL << 24)
466 485
467/* ce_adm_int_mask/ce_adm_int_status register bit defines */ 486/* ce_adm_int_mask/ce_adm_int_status register bit defines */
468#define CE_ADM_INT_CE_ERROR_SHFT 0 487#define CE_ADM_INT_CE_ERROR_SHFT 0
@@ -592,6 +611,11 @@ typedef volatile struct tioce {
592#define CE_URE_RD_MRG_ENABLE (0x1ULL << 0) 611#define CE_URE_RD_MRG_ENABLE (0x1ULL << 0)
593#define CE_URE_WRT_MRG_ENABLE1 (0x1ULL << 4) 612#define CE_URE_WRT_MRG_ENABLE1 (0x1ULL << 4)
594#define CE_URE_WRT_MRG_ENABLE2 (0x1ULL << 5) 613#define CE_URE_WRT_MRG_ENABLE2 (0x1ULL << 5)
614#define CE_URE_WRT_MRG_TIMER_SHFT 12
615#define CE_URE_WRT_MRG_TIMER_MASK (0x7FFULL << CE_URE_WRT_MRG_TIMER_SHFT)
616#define CE_URE_WRT_MRG_TIMER(x) (((u64)(x) << \
617 CE_URE_WRT_MRG_TIMER_SHFT) & \
618 CE_URE_WRT_MRG_TIMER_MASK)
595#define CE_URE_RSPQ_BYPASS_DISABLE (0x1ULL << 24) 619#define CE_URE_RSPQ_BYPASS_DISABLE (0x1ULL << 24)
596#define CE_URE_UPS_DAT1_PAR_DISABLE (0x1ULL << 32) 620#define CE_URE_UPS_DAT1_PAR_DISABLE (0x1ULL << 32)
597#define CE_URE_UPS_HDR1_PAR_DISABLE (0x1ULL << 33) 621#define CE_URE_UPS_HDR1_PAR_DISABLE (0x1ULL << 33)
@@ -653,8 +677,12 @@ typedef volatile struct tioce {
653#define CE_URE_SI (0x1ULL << 0) 677#define CE_URE_SI (0x1ULL << 0)
654#define CE_URE_ELAL_SHFT 4 678#define CE_URE_ELAL_SHFT 4
655#define CE_URE_ELAL_MASK (0x7ULL << CE_URE_ELAL_SHFT) 679#define CE_URE_ELAL_MASK (0x7ULL << CE_URE_ELAL_SHFT)
680#define CE_URE_ELAL_SET(n) (((u64)(n) << CE_URE_ELAL_SHFT) & \
681 CE_URE_ELAL_MASK)
656#define CE_URE_ELAL1_SHFT 8 682#define CE_URE_ELAL1_SHFT 8
657#define CE_URE_ELAL1_MASK (0x7ULL << CE_URE_ELAL1_SHFT) 683#define CE_URE_ELAL1_MASK (0x7ULL << CE_URE_ELAL1_SHFT)
684#define CE_URE_ELAL1_SET(n) (((u64)(n) << CE_URE_ELAL1_SHFT) & \
685 CE_URE_ELAL1_MASK)
658#define CE_URE_SCC (0x1ULL << 12) 686#define CE_URE_SCC (0x1ULL << 12)
659#define CE_URE_PN1_SHFT 16 687#define CE_URE_PN1_SHFT 16
660#define CE_URE_PN1_MASK (0xFFULL << CE_URE_PN1_SHFT) 688#define CE_URE_PN1_MASK (0xFFULL << CE_URE_PN1_SHFT)
@@ -675,8 +703,12 @@ typedef volatile struct tioce {
675#define CE_URE_HPC (0x1ULL << 6) 703#define CE_URE_HPC (0x1ULL << 6)
676#define CE_URE_SPLV_SHFT 7 704#define CE_URE_SPLV_SHFT 7
677#define CE_URE_SPLV_MASK (0xFFULL << CE_URE_SPLV_SHFT) 705#define CE_URE_SPLV_MASK (0xFFULL << CE_URE_SPLV_SHFT)
706#define CE_URE_SPLV_SET(n) (((u64)(n) << CE_URE_SPLV_SHFT) & \
707 CE_URE_SPLV_MASK)
678#define CE_URE_SPLS_SHFT 15 708#define CE_URE_SPLS_SHFT 15
679#define CE_URE_SPLS_MASK (0x3ULL << CE_URE_SPLS_SHFT) 709#define CE_URE_SPLS_MASK (0x3ULL << CE_URE_SPLS_SHFT)
710#define CE_URE_SPLS_SET(n) (((u64)(n) << CE_URE_SPLS_SHFT) & \
711 CE_URE_SPLS_MASK)
680#define CE_URE_PSN1_SHFT 19 712#define CE_URE_PSN1_SHFT 19
681#define CE_URE_PSN1_MASK (0x1FFFULL << CE_URE_PSN1_SHFT) 713#define CE_URE_PSN1_MASK (0x1FFFULL << CE_URE_PSN1_SHFT)
682#define CE_URE_PSN2_SHFT 32 714#define CE_URE_PSN2_SHFT 32
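The *_SET(n) helpers added for tioce.h follow the usual shift-and-mask idiom: shift the value into its field, then AND with the field mask so an oversized argument is clipped instead of corrupting neighbouring bits. A standalone demo of that pattern using the 3-bit CE_URE_ELAL field from the diff:

/* Demonstrates the shift-and-mask field helpers added above
 * (3-bit field at bit 4, as in CE_URE_ELAL). */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

#define CE_URE_ELAL_SHFT 4
#define CE_URE_ELAL_MASK (0x7ULL << CE_URE_ELAL_SHFT)
#define CE_URE_ELAL_SET(n) (((u64)(n) << CE_URE_ELAL_SHFT) & CE_URE_ELAL_MASK)

int main(void)
{
        /* in-range value lands in bits 4..6 */
        printf("ELAL_SET(5) = 0x%llx\n", (unsigned long long)CE_URE_ELAL_SET(5));
        /* out-of-range value is clipped by the mask instead of spilling
         * into neighbouring bits */
        printf("ELAL_SET(9) = 0x%llx\n", (unsigned long long)CE_URE_ELAL_SET(9));
        return 0;
}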
diff --git a/include/asm-ia64/sn/xpc.h b/include/asm-ia64/sn/xpc.h
index df7f5f4f3cde..aa3b8ace9030 100644
--- a/include/asm-ia64/sn/xpc.h
+++ b/include/asm-ia64/sn/xpc.h
@@ -1227,28 +1227,6 @@ xpc_map_bte_errors(bte_result_t error)
1227 1227
1228 1228
1229 1229
1230static inline void *
1231xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
1232{
1233 /* see if kmalloc will give us cachline aligned memory by default */
1234 *base = kmalloc(size, flags);
1235 if (*base == NULL) {
1236 return NULL;
1237 }
1238 if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
1239 return *base;
1240 }
1241 kfree(*base);
1242
1243 /* nope, we'll have to do it ourselves */
1244 *base = kmalloc(size + L1_CACHE_BYTES, flags);
1245 if (*base == NULL) {
1246 return NULL;
1247 }
1248 return (void *) L1_CACHE_ALIGN((u64) *base);
1249}
1250
1251
1252/* 1230/*
1253 * Check to see if there is any channel activity to/from the specified 1231 * Check to see if there is any channel activity to/from the specified
1254 * partition. 1232 * partition.
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index 062538715623..cd4233d66f15 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -244,6 +244,13 @@ extern void ia64_load_extra (struct task_struct *task);
244 __ia64_save_fpu((prev)->thread.fph); \ 244 __ia64_save_fpu((prev)->thread.fph); \
245 } \ 245 } \
246 __switch_to(prev, next, last); \ 246 __switch_to(prev, next, last); \
247 /* "next" in old context is "current" in new context */ \
248 if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) && \
249 (task_cpu(current) != \
250 task_thread_info(current)->last_cpu))) { \
251 platform_migrate(current); \
252 task_thread_info(current)->last_cpu = task_cpu(current); \
253 } \
247} while (0) 254} while (0)
248#else 255#else
249# define switch_to(prev,next,last) __switch_to(prev, next, last) 256# define switch_to(prev,next,last) __switch_to(prev, next, last)
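The switch_to() addition invokes platform_migrate() only when the incoming task has IA64_THREAD_MIGRATION set and it has actually changed CPUs since it last ran, which the new last_cpu field tracks. A stand-alone model of that "call the hook only on a real CPU change" test, with illustrative types in place of the kernel's:

/* Model of the migration check added to switch_to() above.
 * Types and names here are illustrative, not the kernel's. */
#include <stdio.h>

#define THREAD_MIGRATION (1u << 5)

struct task_model {
        unsigned int flags;
        int cpu;       /* where the scheduler placed the task */
        int last_cpu;  /* where it last ran */
};

static void platform_migrate_model(struct task_model *t)
{
        printf("migrate hook: task moved to cpu %d\n", t->cpu);
}

static void on_context_switch(struct task_model *t)
{
        if ((t->flags & THREAD_MIGRATION) && t->cpu != t->last_cpu) {
                platform_migrate_model(t);
                t->last_cpu = t->cpu;
        }
}

int main(void)
{
        struct task_model t = { .flags = THREAD_MIGRATION, .cpu = 0, .last_cpu = 0 };
        on_context_switch(&t);          /* same CPU: hook not called */
        t.cpu = 3;
        on_context_switch(&t);          /* CPU changed: hook called once */
        on_context_switch(&t);          /* no further change: silent */
        return 0;
}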
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 1d6518fe1f02..56394a2c7055 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -26,16 +26,10 @@ struct thread_info {
26 struct exec_domain *exec_domain;/* execution domain */ 26 struct exec_domain *exec_domain;/* execution domain */
27 __u32 flags; /* thread_info flags (see TIF_*) */ 27 __u32 flags; /* thread_info flags (see TIF_*) */
28 __u32 cpu; /* current CPU */ 28 __u32 cpu; /* current CPU */
29 __u32 last_cpu; /* Last CPU thread ran on */
29 mm_segment_t addr_limit; /* user-level address space limit */ 30 mm_segment_t addr_limit; /* user-level address space limit */
30 int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ 31 int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
31 struct restart_block restart_block; 32 struct restart_block restart_block;
32 struct {
33 int signo;
34 int code;
35 void __user *addr;
36 unsigned long start_time;
37 pid_t pid;
38 } sigdelayed; /* Saved information for TIF_SIGDELAYED */
39}; 33};
40 34
41#define THREAD_SIZE KERNEL_STACK_SIZE 35#define THREAD_SIZE KERNEL_STACK_SIZE
@@ -89,7 +83,6 @@ struct thread_info {
89#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ 83#define TIF_NEED_RESCHED 2 /* rescheduling necessary */
90#define TIF_SYSCALL_TRACE 3 /* syscall trace active */ 84#define TIF_SYSCALL_TRACE 3 /* syscall trace active */
91#define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ 85#define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */
92#define TIF_SIGDELAYED 5 /* signal delayed from MCA/INIT/NMI/PMI context */
93#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ 86#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
94#define TIF_MEMDIE 17 87#define TIF_MEMDIE 17
95#define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ 88#define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */
@@ -101,13 +94,12 @@ struct thread_info {
101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 94#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
102#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 95#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
103#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 96#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
104#define _TIF_SIGDELAYED (1 << TIF_SIGDELAYED)
105#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) 97#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
106#define _TIF_MCA_INIT (1 << TIF_MCA_INIT) 98#define _TIF_MCA_INIT (1 << TIF_MCA_INIT)
107#define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) 99#define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED)
108 100
109/* "work to do on user-return" bits */ 101/* "work to do on user-return" bits */
110#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SIGDELAYED) 102#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
111/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ 103/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
112#define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) 104#define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
113 105
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index e38931379a72..185ee15963a1 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[];
468 468
469extern void paging_init(void); 469extern void paging_init(void);
470 470
471#ifdef CONFIG_HUGETLB_PAGE
472#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
473 free_pgd_range(tlb, addr, end, floor, ceiling)
474#endif
475
476/* 471/*
477 * This gets called at the end of handling a page fault, when 472 * This gets called at the end of handling a page fault, when
478 * the kernel has put a new PTE into the page table for the process. 473 * the kernel has put a new PTE into the page table for the process.
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index 3417dd71ab43..e28aaf28e4a8 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -158,11 +158,4 @@ static inline void pte_free(struct page *pte)
158 158
159#define __pte_free_tlb(tlb,pte) tlb_remove_page(tlb,pte) 159#define __pte_free_tlb(tlb,pte) tlb_remove_page(tlb,pte)
160 160
161/*
162 * This establishes kernel virtual mappings (e.g., as a result of a
163 * vmalloc call). Since s390-esame uses a separate kernel page table,
164 * there is nothing to do here... :)
165 */
166#define set_pgdir(addr,entry) do { } while(0)
167
168#endif /* _S390_PGALLOC_H */ 161#endif /* _S390_PGALLOC_H */
diff --git a/include/asm-sh64/pgalloc.h b/include/asm-sh64/pgalloc.h
index 678251ac1db8..b29dd468817e 100644
--- a/include/asm-sh64/pgalloc.h
+++ b/include/asm-sh64/pgalloc.h
@@ -167,22 +167,6 @@ static __inline__ void pmd_free(pmd_t *pmd)
167 167
168extern int do_check_pgt_cache(int, int); 168extern int do_check_pgt_cache(int, int);
169 169
170static inline void set_pgdir(unsigned long address, pgd_t entry)
171{
172 struct task_struct * p;
173 pgd_t *pgd;
174
175 read_lock(&tasklist_lock);
176 for_each_process(p) {
177 if (!p->mm)
178 continue;
179 *pgd_offset(p->mm,address) = entry;
180 }
181 read_unlock(&tasklist_lock);
182 for (pgd = (pgd_t *)pgd_quicklist; pgd; pgd = (pgd_t *)*(unsigned long *)pgd)
183 pgd[address >> PGDIR_SHIFT] = entry;
184}
185
186#define pmd_populate_kernel(mm, pmd, pte) \ 170#define pmd_populate_kernel(mm, pmd, pte) \
187 set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) (pte))) 171 set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) (pte)))
188 172
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 715fd94cf577..a617d364d08d 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
273static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } 273static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
274static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } 274static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; }
275static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } 275static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
276static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; } 276static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; }
277 277
278static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } 278static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
279static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } 279static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
@@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _
285static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } 285static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
286static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } 286static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
287static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } 287static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
288static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } 288static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
289 289
290struct vm_area_struct; 290struct vm_area_struct;
291 291
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68d82ad6b17c..d6f1019625af 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,10 +20,7 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long)
20int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 20int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
21int hugetlb_report_meminfo(char *); 21int hugetlb_report_meminfo(char *);
22int hugetlb_report_node_meminfo(int, char *); 22int hugetlb_report_node_meminfo(int, char *);
23int is_hugepage_mem_enough(size_t);
24unsigned long hugetlb_total_pages(void); 23unsigned long hugetlb_total_pages(void);
25struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26void free_huge_page(struct page *);
27int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 24int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
28 unsigned long address, int write_access); 25 unsigned long address, int write_access);
29 26
@@ -39,18 +36,35 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
39 int write); 36 int write);
40struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 37struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
41 pmd_t *pmd, int write); 38 pmd_t *pmd, int write);
42int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
43int pmd_huge(pmd_t pmd); 39int pmd_huge(pmd_t pmd);
40void hugetlb_change_protection(struct vm_area_struct *vma,
41 unsigned long address, unsigned long end, pgprot_t newprot);
44 42
45#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE 43#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
46#define is_hugepage_only_range(mm, addr, len) 0 44#define is_hugepage_only_range(mm, addr, len) 0
47#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ 45#endif
48 do { } while (0) 46
47#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
48#define hugetlb_free_pgd_range free_pgd_range
49#else
50void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
51 unsigned long end, unsigned long floor,
52 unsigned long ceiling);
49#endif 53#endif
50 54
51#ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE 55#ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
52#define prepare_hugepage_range(addr, len) \ 56/*
53 is_aligned_hugepage_range(addr, len) 57 * If the arch doesn't supply something else, assume that hugepage
58 * size aligned regions are ok without further preparation.
59 */
60static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
61{
62 if (len & ~HPAGE_MASK)
63 return -EINVAL;
64 if (addr & ~HPAGE_MASK)
65 return -EINVAL;
66 return 0;
67}
54#else 68#else
55int prepare_hugepage_range(unsigned long addr, unsigned long len); 69int prepare_hugepage_range(unsigned long addr, unsigned long len);
56#endif 70#endif
@@ -87,20 +101,17 @@ static inline unsigned long hugetlb_total_pages(void)
87#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 101#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
88#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 102#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
89#define unmap_hugepage_range(vma, start, end) BUG() 103#define unmap_hugepage_range(vma, start, end) BUG()
90#define is_hugepage_mem_enough(size) 0
91#define hugetlb_report_meminfo(buf) 0 104#define hugetlb_report_meminfo(buf) 0
92#define hugetlb_report_node_meminfo(n, buf) 0 105#define hugetlb_report_node_meminfo(n, buf) 0
93#define follow_huge_pmd(mm, addr, pmd, write) NULL 106#define follow_huge_pmd(mm, addr, pmd, write) NULL
94#define is_aligned_hugepage_range(addr, len) 0
95#define prepare_hugepage_range(addr, len) (-EINVAL) 107#define prepare_hugepage_range(addr, len) (-EINVAL)
96#define pmd_huge(x) 0 108#define pmd_huge(x) 0
97#define is_hugepage_only_range(mm, addr, len) 0 109#define is_hugepage_only_range(mm, addr, len) 0
98#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ 110#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
99 do { } while (0)
100#define alloc_huge_page(vma, addr) ({ NULL; })
101#define free_huge_page(p) ({ (void)(p); BUG(); })
102#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) 111#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 112
113#define hugetlb_change_protection(vma, address, end, newprot)
114
104#ifndef HPAGE_MASK 115#ifndef HPAGE_MASK
105#define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ 116#define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */
106#define HPAGE_SIZE PAGE_SIZE 117#define HPAGE_SIZE PAGE_SIZE
@@ -128,6 +139,8 @@ struct hugetlbfs_sb_info {
128 139
129struct hugetlbfs_inode_info { 140struct hugetlbfs_inode_info {
130 struct shared_policy policy; 141 struct shared_policy policy;
142 /* Protected by the (global) hugetlb_lock */
143 unsigned long prereserved_hpages;
131 struct inode vfs_inode; 144 struct inode vfs_inode;
132}; 145};
133 146
@@ -144,6 +157,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
144extern struct file_operations hugetlbfs_file_operations; 157extern struct file_operations hugetlbfs_file_operations;
145extern struct vm_operations_struct hugetlb_vm_ops; 158extern struct vm_operations_struct hugetlb_vm_ops;
146struct file *hugetlb_zero_setup(size_t); 159struct file *hugetlb_zero_setup(size_t);
160int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
161 unsigned long atleast_hpages);
162void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
163 unsigned long atmost_hpages);
147int hugetlb_get_quota(struct address_space *mapping); 164int hugetlb_get_quota(struct address_space *mapping);
148void hugetlb_put_quota(struct address_space *mapping); 165void hugetlb_put_quota(struct address_space *mapping);
149 166
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
new file mode 100644
index 000000000000..7d09962c3c0b
--- /dev/null
+++ b/include/linux/migrate.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_MIGRATE_H
2#define _LINUX_MIGRATE_H
3
4#include <linux/config.h>
5#include <linux/mm.h>
6
7#ifdef CONFIG_MIGRATION
8extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
9extern int putback_lru_pages(struct list_head *l);
10extern int migrate_page(struct page *, struct page *);
11extern void migrate_page_copy(struct page *, struct page *);
12extern int migrate_page_remove_references(struct page *, struct page *, int);
13extern int migrate_pages(struct list_head *l, struct list_head *t,
14 struct list_head *moved, struct list_head *failed);
15int migrate_pages_to(struct list_head *pagelist,
16 struct vm_area_struct *vma, int dest);
17extern int fail_migrate_page(struct page *, struct page *);
18
19extern int migrate_prep(void);
20
21#else
22
23static inline int isolate_lru_page(struct page *p, struct list_head *list)
24 { return -ENOSYS; }
25static inline int putback_lru_pages(struct list_head *l) { return 0; }
26static inline int migrate_pages(struct list_head *l, struct list_head *t,
27 struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
28
29static inline int migrate_prep(void) { return -ENOSYS; }
30
31/* Possible settings for the migrate_page() method in address_operations */
32#define migrate_page NULL
33#define fail_migrate_page NULL
34
35#endif /* CONFIG_MIGRATION */
36#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 498ff8778fb6..6aa016f1d3ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -286,43 +286,34 @@ struct page {
286 * 286 *
287 * Also, many kernel routines increase the page count before a critical 287 * Also, many kernel routines increase the page count before a critical
288 * routine so they can be sure the page doesn't go away from under them. 288 * routine so they can be sure the page doesn't go away from under them.
289 *
290 * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we
291 * can use atomic_add_negative(-1, page->_count) to detect when the page
292 * becomes free and so that we can also use atomic_inc_and_test to atomically
293 * detect when we just tried to grab a ref on a page which some other CPU has
294 * already deemed to be freeable.
295 *
296 * NO code should make assumptions about this internal detail! Use the provided
297 * macros which retain the old rules: page_count(page) == 0 is a free page.
298 */ 289 */
299 290
300/* 291/*
301 * Drop a ref, return true if the logical refcount fell to zero (the page has 292 * Drop a ref, return true if the logical refcount fell to zero (the page has
302 * no users) 293 * no users)
303 */ 294 */
304#define put_page_testzero(p) \ 295static inline int put_page_testzero(struct page *page)
305 ({ \ 296{
306 BUG_ON(atomic_read(&(p)->_count) == -1);\ 297 BUG_ON(atomic_read(&page->_count) == 0);
307 atomic_add_negative(-1, &(p)->_count); \ 298 return atomic_dec_and_test(&page->_count);
308 }) 299}
309 300
310/* 301/*
311 * Grab a ref, return true if the page previously had a logical refcount of 302 * Try to grab a ref unless the page has a refcount of zero, return false if
312 * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page 303 * that is the case.
313 */ 304 */
314#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) 305static inline int get_page_unless_zero(struct page *page)
315 306{
316#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) 307 return atomic_inc_not_zero(&page->_count);
317#define __put_page(p) atomic_dec(&(p)->_count) 308}
318 309
319extern void FASTCALL(__page_cache_release(struct page *)); 310extern void FASTCALL(__page_cache_release(struct page *));
320 311
321static inline int page_count(struct page *page) 312static inline int page_count(struct page *page)
322{ 313{
323 if (PageCompound(page)) 314 if (unlikely(PageCompound(page)))
324 page = (struct page *)page_private(page); 315 page = (struct page *)page_private(page);
325 return atomic_read(&page->_count) + 1; 316 return atomic_read(&page->_count);
326} 317}
327 318
328static inline void get_page(struct page *page) 319static inline void get_page(struct page *page)
@@ -332,8 +323,19 @@ static inline void get_page(struct page *page)
332 atomic_inc(&page->_count); 323 atomic_inc(&page->_count);
333} 324}
334 325
326/*
327 * Setup the page count before being freed into the page allocator for
328 * the first time (boot or memory hotplug)
329 */
330static inline void init_page_count(struct page *page)
331{
332 atomic_set(&page->_count, 1);
333}
334
335void put_page(struct page *page); 335void put_page(struct page *page);
336 336
337void split_page(struct page *page, unsigned int order);
338
337/* 339/*
338 * Multiple processes may "see" the same page. E.g. for untouched 340 * Multiple processes may "see" the same page. E.g. for untouched
339 * mappings of /dev/null, all processes see the same page full of 341 * mappings of /dev/null, all processes see the same page full of
@@ -1046,7 +1048,7 @@ int in_gate_area_no_task(unsigned long addr);
1046 1048
1047int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, 1049int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
1048 void __user *, size_t *, loff_t *); 1050 void __user *, size_t *, loff_t *);
1049int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 1051unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
1050 unsigned long lru_pages); 1052 unsigned long lru_pages);
1051void drop_pagecache(void); 1053void drop_pagecache(void);
1052void drop_slab(void); 1054void drop_slab(void);
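With these mm.h changes a free page simply has _count == 0: put_page_testzero() becomes a plain atomic_dec_and_test(), and get_page_unless_zero() is an atomic_inc_not_zero(), an increment that refuses to resurrect a page whose count already reached zero. A user-space model of those two operations using C11 atomics:

/* Model of put_page_testzero()/get_page_unless_zero() semantics above,
 * using C11 atomics instead of the kernel's atomic_t. */
#include <stdio.h>
#include <stdatomic.h>

static int get_unless_zero(atomic_int *count)
{
        int old = atomic_load(count);
        /* inc_not_zero: CAS loop that never moves 0 -> 1 */
        while (old != 0) {
                if (atomic_compare_exchange_weak(count, &old, old + 1))
                        return 1;
        }
        return 0;
}

static int put_testzero(atomic_int *count)
{
        /* dec_and_test: true when this caller dropped the last reference */
        return atomic_fetch_sub(count, 1) == 1;
}

int main(void)
{
        atomic_int refs = 1;                               /* as set by init_page_count() */
        printf("got ref:  %d\n", get_unless_zero(&refs));  /* 1, refs = 2 */
        printf("last put: %d\n", put_testzero(&refs));     /* 0, refs = 1 */
        printf("last put: %d\n", put_testzero(&refs));     /* 1, refs = 0 */
        printf("got ref:  %d\n", get_unless_zero(&refs));  /* 0, page stays free */
        return 0;
}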
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8ac854f7f190..3b6723dfaff3 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
32{ 32{
33 list_del(&page->lru); 33 list_del(&page->lru);
34 if (PageActive(page)) { 34 if (PageActive(page)) {
35 ClearPageActive(page); 35 __ClearPageActive(page);
36 zone->nr_active--; 36 zone->nr_active--;
37 } else { 37 } else {
38 zone->nr_inactive--; 38 zone->nr_inactive--;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d52999c43336..9ea629c02a4b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -86,8 +86,9 @@
86 * - The __xxx_page_state variants can be used safely when interrupts are 86 * - The __xxx_page_state variants can be used safely when interrupts are
87 * disabled. 87 * disabled.
88 * - The __xxx_page_state variants can be used if the field is only 88 * - The __xxx_page_state variants can be used if the field is only
89 * modified from process context, or only modified from interrupt context. 89 * modified from process context and protected from preemption, or only
90 * In this case, the field should be commented here. 90 * modified from interrupt context. In this case, the field should be
91 * commented here.
91 */ 92 */
92struct page_state { 93struct page_state {
93 unsigned long nr_dirty; /* Dirty writeable pages */ 94 unsigned long nr_dirty; /* Dirty writeable pages */
@@ -239,22 +240,19 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
239#define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) 240#define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags)
240#define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) 241#define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags)
241 242
242#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags)
243#define PageLRU(page) test_bit(PG_lru, &(page)->flags) 243#define PageLRU(page) test_bit(PG_lru, &(page)->flags)
244#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) 244#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags)
245#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) 245#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags)
246#define __ClearPageLRU(page) __clear_bit(PG_lru, &(page)->flags)
246 247
247#define PageActive(page) test_bit(PG_active, &(page)->flags) 248#define PageActive(page) test_bit(PG_active, &(page)->flags)
248#define SetPageActive(page) set_bit(PG_active, &(page)->flags) 249#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
249#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) 250#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags)
250#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) 251#define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags)
251#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
252 252
253#define PageSlab(page) test_bit(PG_slab, &(page)->flags) 253#define PageSlab(page) test_bit(PG_slab, &(page)->flags)
254#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) 254#define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags)
255#define ClearPageSlab(page) clear_bit(PG_slab, &(page)->flags) 255#define __ClearPageSlab(page) __clear_bit(PG_slab, &(page)->flags)
256#define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags)
257#define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags)
258 256
259#ifdef CONFIG_HIGHMEM 257#ifdef CONFIG_HIGHMEM
260#define PageHighMem(page) is_highmem(page_zone(page)) 258#define PageHighMem(page) is_highmem(page_zone(page))
@@ -329,8 +327,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
329#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) 327#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
330 328
331#define PageCompound(page) test_bit(PG_compound, &(page)->flags) 329#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
332#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) 330#define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags)
333#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) 331#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags)
334 332
335#ifdef CONFIG_SWAP 333#ifdef CONFIG_SWAP
336#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) 334#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags)
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 0b2ba67ff13c..b739ac1f7ca0 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -11,8 +11,6 @@
11#ifndef _LINUX_RTC_H_ 11#ifndef _LINUX_RTC_H_
12#define _LINUX_RTC_H_ 12#define _LINUX_RTC_H_
13 13
14#include <linux/interrupt.h>
15
16/* 14/*
17 * The struct used to pass data via the following ioctl. Similar to the 15 * The struct used to pass data via the following ioctl. Similar to the
18 * struct tm in <time.h>, but it needs to be here so that the kernel 16 * struct tm in <time.h>, but it needs to be here so that the kernel
@@ -95,6 +93,8 @@ struct rtc_pll_info {
95 93
96#ifdef __KERNEL__ 94#ifdef __KERNEL__
97 95
96#include <linux/interrupt.h>
97
98typedef struct rtc_task { 98typedef struct rtc_task {
99 void (*func)(void *private_data); 99 void (*func)(void *private_data);
100 void *private_data; 100 void *private_data;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8cf52939d0ab..2b28c849d75a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t;
38#define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ 38#define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */
39#define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ 39#define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */
40#define SLAB_POISON 0x00000800UL /* Poison objects */ 40#define SLAB_POISON 0x00000800UL /* Poison objects */
41#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */
42#define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ 41#define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */
43#define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ 42#define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */
44#define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ 43#define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */
@@ -118,7 +117,7 @@ extern void *kzalloc(size_t, gfp_t);
118 */ 117 */
119static inline void *kcalloc(size_t n, size_t size, gfp_t flags) 118static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
120{ 119{
121 if (n != 0 && size > INT_MAX / n) 120 if (n != 0 && size > ULONG_MAX / n)
122 return NULL; 121 return NULL;
123 return kzalloc(n * size, flags); 122 return kzalloc(n * size, flags);
124} 123}
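The kcalloc() guard rejects any n * size product that would wrap around: the multiplication overflows exactly when size > MAX / n, so comparing against the full unsigned long range rather than INT_MAX presumably keeps the overflow check while no longer refusing legitimately large allocations. A user-space illustration of the same guard written with SIZE_MAX:

/* Overflow guard used by kcalloc() above, shown with SIZE_MAX in
 * user space (the kernel uses ULONG_MAX, the same width on most ports). */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void *calloc_demo(size_t n, size_t size)
{
        if (n != 0 && size > SIZE_MAX / n)
                return NULL;                 /* n * size would wrap around */
        void *p = malloc(n * size);
        if (p)
                memset(p, 0, n * size);
        return p;
}

int main(void)
{
        void *ok  = calloc_demo(1024, 64);
        void *bad = calloc_demo(SIZE_MAX / 2, 16);   /* would overflow */
        printf("ok=%p bad=%p\n", ok, bad);
        free(ok);
        return 0;
}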
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 44153fdf73fc..d699a16b0cb2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus);
52/* 52/*
53 * Call a function on all other processors 53 * Call a function on all other processors
54 */ 54 */
55extern int smp_call_function (void (*func) (void *info), void *info, 55int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
56 int retry, int wait);
57 56
58/* 57/*
59 * Call a function on all processors 58 * Call a function on all processors
60 */ 59 */
61static inline int on_each_cpu(void (*func) (void *info), void *info, 60int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
62 int retry, int wait)
63{
64 int ret = 0;
65
66 preempt_disable();
67 ret = smp_call_function(func, info, retry, wait);
68 func(info);
69 preempt_enable();
70 return ret;
71}
72 61
73#define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ 62#define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */
74#define MSG_ALL 0x8001 63#define MSG_ALL 0x8001
@@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void);
94#define raw_smp_processor_id() 0 83#define raw_smp_processor_id() 0
95#define hard_smp_processor_id() 0 84#define hard_smp_processor_id() 0
96#define smp_call_function(func,info,retry,wait) ({ 0; }) 85#define smp_call_function(func,info,retry,wait) ({ 0; })
97#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) 86#define on_each_cpu(func,info,retry,wait) \
87 ({ \
88 local_irq_disable(); \
89 func(info); \
90 local_irq_enable(); \
91 0; \
92 })
98static inline void smp_send_reschedule(int cpu) { } 93static inline void smp_send_reschedule(int cpu) { }
99#define num_booting_cpus() 1 94#define num_booting_cpus() 1
100#define smp_prepare_boot_cpu() do {} while (0) 95#define smp_prepare_boot_cpu() do {} while (0)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d572b19afb7d..12415dd94451 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,9 +172,24 @@ extern int rotate_reclaimable_page(struct page *page);
172extern void swap_setup(void); 172extern void swap_setup(void);
173 173
174/* linux/mm/vmscan.c */ 174/* linux/mm/vmscan.c */
175extern int try_to_free_pages(struct zone **, gfp_t); 175extern unsigned long try_to_free_pages(struct zone **, gfp_t);
176extern int shrink_all_memory(int); 176extern unsigned long shrink_all_memory(unsigned long nr_pages);
177extern int vm_swappiness; 177extern int vm_swappiness;
178extern int remove_mapping(struct address_space *mapping, struct page *page);
179
180/* possible outcome of pageout() */
181typedef enum {
182 /* failed to write page out, page is locked */
183 PAGE_KEEP,
184 /* move page to the active list, page is locked */
185 PAGE_ACTIVATE,
186 /* page has been sent to the disk successfully, page is unlocked */
187 PAGE_SUCCESS,
188 /* page is clean and locked */
189 PAGE_CLEAN,
190} pageout_t;
191
192extern pageout_t pageout(struct page *page, struct address_space *mapping);
178 193
179#ifdef CONFIG_NUMA 194#ifdef CONFIG_NUMA
180extern int zone_reclaim_mode; 195extern int zone_reclaim_mode;
@@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
188} 203}
189#endif 204#endif
190 205
191#ifdef CONFIG_MIGRATION
192extern int isolate_lru_page(struct page *p);
193extern int putback_lru_pages(struct list_head *l);
194extern int migrate_page(struct page *, struct page *);
195extern void migrate_page_copy(struct page *, struct page *);
196extern int migrate_page_remove_references(struct page *, struct page *, int);
197extern int migrate_pages(struct list_head *l, struct list_head *t,
198 struct list_head *moved, struct list_head *failed);
199extern int fail_migrate_page(struct page *, struct page *);
200#else
201static inline int isolate_lru_page(struct page *p) { return -ENOSYS; }
202static inline int putback_lru_pages(struct list_head *l) { return 0; }
203static inline int migrate_pages(struct list_head *l, struct list_head *t,
204 struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
205/* Possible settings for the migrate_page() method in address_operations */
206#define migrate_page NULL
207#define fail_migrate_page NULL
208#endif
209
210#ifdef CONFIG_MMU 206#ifdef CONFIG_MMU
211/* linux/mm/shmem.c */ 207/* linux/mm/shmem.c */
212extern int shmem_unuse(swp_entry_t entry, struct page *page); 208extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca497..9bd7b65ee418 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1534 1534
1535 check_unshare_flags(&unshare_flags); 1535 check_unshare_flags(&unshare_flags);
1536 1536
1537 /* Return -EINVAL for all unsupported flags */
1538 err = -EINVAL;
1539 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1540 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
1541 goto bad_unshare_out;
1542
1537 if ((err = unshare_thread(unshare_flags))) 1543 if ((err = unshare_thread(unshare_flags)))
1538 goto bad_unshare_out; 1544 goto bad_unshare_out;
1539 if ((err = unshare_fs(unshare_flags, &new_fs))) 1545 if ((err = unshare_fs(unshare_flags, &new_fs)))
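
A minimal userspace sketch of the new behaviour (not part of the patch; it assumes a glibc that provides the unshare() wrapper, otherwise invoke the syscall directly): a flag outside the supported set now fails up front with EINVAL instead of being silently accepted.

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	/* CLONE_VFORK is not in the list of flags sys_unshare() accepts */
	if (unshare(CLONE_VFORK) == -1 && errno == EINVAL)
		printf("unsupported unshare() flag rejected with EINVAL\n");
	else
		printf("unexpected result, errno=%d\n", errno);
	return 0;
}
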
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d46e90f59c3..6b6e0d70eb30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -707,12 +707,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
707 DEF_TIMESLICE); 707 DEF_TIMESLICE);
708 } else { 708 } else {
709 /* 709 /*
710 * The lower the sleep avg a task has the more
711 * rapidly it will rise with sleep time.
712 */
713 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
714
715 /*
716 * Tasks waking from uninterruptible sleep are 710 * Tasks waking from uninterruptible sleep are
717 * limited in their sleep_avg rise as they 711 * limited in their sleep_avg rise as they
718 * are likely to be waiting on I/O 712 * are likely to be waiting on I/O
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdded5..ec8fed42a86f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h>
19 20
20#include <asm/irq.h> 21#include <asm/irq.h>
21/* 22/*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
495 register_cpu_notifier(&cpu_nfb); 496 register_cpu_notifier(&cpu_nfb);
496 return 0; 497 return 0;
497} 498}
499
500#ifdef CONFIG_SMP
501/*
502 * Call a function on all processors
503 */
504int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
505{
506 int ret = 0;
507
508 preempt_disable();
509 ret = smp_call_function(func, info, retry, wait);
510 local_irq_disable();
511 func(info);
512 local_irq_enable();
513 preempt_enable();
514 return ret;
515}
516EXPORT_SYMBOL(on_each_cpu);
517#endif
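
A hypothetical caller (names are illustrative, not from this patch) showing how the now-shared SMP implementation of on_each_cpu() is used; the function runs on every online CPU, including the local one with interrupts disabled, matching the UP stub above.

#include <linux/kernel.h>
#include <linux/smp.h>

static void report_cpu(void *info)
{
	printk(KERN_DEBUG "%s: running on CPU %d\n",
	       (char *)info, smp_processor_id());
}

static void report_all_cpus(void)
{
	/* retry = 0, wait = 1: return only after every CPU has run report_cpu() */
	on_each_cpu(report_cpu, "demo", 0, 1);
}
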
diff --git a/lib/string.c b/lib/string.c
index 037a48acedbb..b3c28a3f6332 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -403,7 +403,6 @@ char *strpbrk(const char *cs, const char *ct)
403 } 403 }
404 return NULL; 404 return NULL;
405} 405}
406EXPORT_SYMBOL(strpbrk);
407#endif 406#endif
408 407
409#ifndef __HAVE_ARCH_STRSEP 408#ifndef __HAVE_ARCH_STRSEP
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..bd80460360db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
137# support for page migration 137# support for page migration
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration"
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP 142 depends on SWAP
143 help
144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for
146 example on NUMA systems to put pages nearer to the processors accessing
147 the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..f10c753dce6d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 22obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o
26
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d476994..e8f58f7dd7a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include "filemap.h" 32#include "filemap.h"
33#include "internal.h"
34
33/* 35/*
34 * FIXME: remove all knowledge of the buffer layer from the core VM 36 * FIXME: remove all knowledge of the buffer layer from the core VM
35 */ 37 */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h>
16 17
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include "internal.h"
21 23
22const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
23static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
24unsigned long max_huge_pages; 26unsigned long max_huge_pages;
25static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
26static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
27static unsigned int free_huge_pages_node[MAX_NUMNODES]; 29static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/* 30/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 31 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */ 32 */
32static DEFINE_SPINLOCK(hugetlb_lock); 33static DEFINE_SPINLOCK(hugetlb_lock);
33 34
35static void clear_huge_page(struct page *page, unsigned long addr)
36{
37 int i;
38
39 might_sleep();
40 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
41 cond_resched();
42 clear_user_highpage(page + i, addr);
43 }
44}
45
46static void copy_huge_page(struct page *dst, struct page *src,
47 unsigned long addr)
48{
49 int i;
50
51 might_sleep();
52 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
53 cond_resched();
54 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
55 }
56}
57
34static void enqueue_huge_page(struct page *page) 58static void enqueue_huge_page(struct page *page)
35{ 59{
36 int nid = page_to_nid(page); 60 int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
64 return page; 88 return page;
65} 89}
66 90
67static struct page *alloc_fresh_huge_page(void) 91static void free_huge_page(struct page *page)
92{
93 BUG_ON(page_count(page));
94
95 INIT_LIST_HEAD(&page->lru);
96
97 spin_lock(&hugetlb_lock);
98 enqueue_huge_page(page);
99 spin_unlock(&hugetlb_lock);
100}
101
102static int alloc_fresh_huge_page(void)
68{ 103{
69 static int nid = 0; 104 static int nid = 0;
70 struct page *page; 105 struct page *page;
71 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, 106 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
72 HUGETLB_PAGE_ORDER); 107 HUGETLB_PAGE_ORDER);
73 nid = (nid + 1) % num_online_nodes(); 108 nid = next_node(nid, node_online_map);
109 if (nid == MAX_NUMNODES)
110 nid = first_node(node_online_map);
74 if (page) { 111 if (page) {
112 page[1].lru.next = (void *)free_huge_page; /* dtor */
75 spin_lock(&hugetlb_lock); 113 spin_lock(&hugetlb_lock);
76 nr_huge_pages++; 114 nr_huge_pages++;
77 nr_huge_pages_node[page_to_nid(page)]++; 115 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock); 116 spin_unlock(&hugetlb_lock);
117 put_page(page); /* free it into the hugepage allocator */
118 return 1;
79 } 119 }
80 return page; 120 return 0;
81} 121}
82 122
83void free_huge_page(struct page *page) 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr)
84{ 125{
85 BUG_ON(page_count(page)); 126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
86 130
87 INIT_LIST_HEAD(&page->lru); 131 spin_lock(&hugetlb_lock);
88 page[1].lru.next = NULL; /* reset dtor */ 132
133 if (vma->vm_flags & VM_MAYSHARE) {
134
135 /* idx = radix tree index, i.e. offset into file in
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159
160 page = dequeue_huge_page(vma, addr);
161 if (!page)
162 goto fail;
163
164 spin_unlock(&hugetlb_lock);
165 set_page_refcounted(page);
166 return page;
167
168 fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock);
171 return NULL;
172}
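
The idx computation above can be reproduced in plain C; this sketch assumes x86-style 2 MB huge pages (HPAGE_SHIFT = 21, PAGE_SHIFT = 12) purely for illustration: the faulting address is turned into a hugepage-sized offset within the mapping, then shifted by the mapping's starting offset in the file.

#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_SHIFT	21

static unsigned long hugepage_index(unsigned long addr, unsigned long vm_start,
				    unsigned long vm_pgoff)
{
	return ((addr - vm_start) >> HPAGE_SHIFT)
		+ (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
	/* 3rd huge page of a mapping that starts 2 huge pages into the file */
	unsigned long idx = hugepage_index(0x40000000UL + 2 * (1UL << HPAGE_SHIFT),
					   0x40000000UL,
					   2UL << (HPAGE_SHIFT - PAGE_SHIFT));

	printf("idx = %lu\n", idx);	/* prints 4 */
	return 0;
}
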
173
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
89 189
90 spin_lock(&hugetlb_lock); 190 spin_lock(&hugetlb_lock);
91 enqueue_huge_page(page); 191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
92 spin_unlock(&hugetlb_lock); 211 spin_unlock(&hugetlb_lock);
212
213 return ret;
93} 214}
94 215
95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
220 * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
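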
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
96{ 225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
97 struct page *page; 230 struct page *page;
98 int i;
99 231
100 spin_lock(&hugetlb_lock); 232 spin_lock(&hugetlb_lock);
101 page = dequeue_huge_page(vma, addr); 233 read_lock_irq(&inode->i_mapping->tree_lock);
102 if (!page) { 234
103 spin_unlock(&hugetlb_lock); 235 if (info->prereserved_hpages <= atmost)
104 return NULL; 236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
105 } 247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
106 spin_unlock(&hugetlb_lock); 255 spin_unlock(&hugetlb_lock);
107 set_page_count(page, 1);
108 page[1].lru.next = (void *)free_huge_page; /* set dtor */
109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110 clear_user_highpage(&page[i], addr);
111 return page;
112} 256}
113 257
114static int __init hugetlb_init(void) 258static int __init hugetlb_init(void)
115{ 259{
116 unsigned long i; 260 unsigned long i;
117 struct page *page;
118 261
119 if (HPAGE_SHIFT == 0) 262 if (HPAGE_SHIFT == 0)
120 return 0; 263 return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
123 INIT_LIST_HEAD(&hugepage_freelists[i]); 266 INIT_LIST_HEAD(&hugepage_freelists[i]);
124 267
125 for (i = 0; i < max_huge_pages; ++i) { 268 for (i = 0; i < max_huge_pages; ++i) {
126 page = alloc_fresh_huge_page(); 269 if (!alloc_fresh_huge_page())
127 if (!page)
128 break; 270 break;
129 spin_lock(&hugetlb_lock);
130 enqueue_huge_page(page);
131 spin_unlock(&hugetlb_lock);
132 } 271 }
133 max_huge_pages = free_huge_pages = nr_huge_pages = i; 272 max_huge_pages = free_huge_pages = nr_huge_pages = i;
134 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 273 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
154 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 293 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
155 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 294 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
156 1 << PG_private | 1<< PG_writeback); 295 1 << PG_private | 1<< PG_writeback);
157 set_page_count(&page[i], 0);
158 } 296 }
159 set_page_count(page, 1); 297 page[1].lru.next = NULL;
298 set_page_refcounted(page);
160 __free_pages(page, HUGETLB_PAGE_ORDER); 299 __free_pages(page, HUGETLB_PAGE_ORDER);
161} 300}
162 301
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
188static unsigned long set_max_huge_pages(unsigned long count) 327static unsigned long set_max_huge_pages(unsigned long count)
189{ 328{
190 while (count > nr_huge_pages) { 329 while (count > nr_huge_pages) {
191 struct page *page = alloc_fresh_huge_page(); 330 if (!alloc_fresh_huge_page())
192 if (!page)
193 return nr_huge_pages; 331 return nr_huge_pages;
194 spin_lock(&hugetlb_lock);
195 enqueue_huge_page(page);
196 spin_unlock(&hugetlb_lock);
197 } 332 }
198 if (count >= nr_huge_pages) 333 if (count >= nr_huge_pages)
199 return nr_huge_pages; 334 return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
225 return sprintf(buf, 360 return sprintf(buf,
226 "HugePages_Total: %5lu\n" 361 "HugePages_Total: %5lu\n"
227 "HugePages_Free: %5lu\n" 362 "HugePages_Free: %5lu\n"
363 "HugePages_Rsvd: %5lu\n"
228 "Hugepagesize: %5lu kB\n", 364 "Hugepagesize: %5lu kB\n",
229 nr_huge_pages, 365 nr_huge_pages,
230 free_huge_pages, 366 free_huge_pages,
367 reserved_huge_pages,
231 HPAGE_SIZE/1024); 368 HPAGE_SIZE/1024);
232} 369}
233 370
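
The new HugePages_Rsvd counter is exported through /proc/meminfo alongside the existing hugepage counters; a quick userspace check (illustration only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages_Rsvd:", 15))
			fputs(line, stdout);	/* pages reserved but not yet faulted in */
	fclose(f);
	return 0;
}
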
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
240 nid, free_huge_pages_node[nid]); 377 nid, free_huge_pages_node[nid]);
241} 378}
242 379
243int is_hugepage_mem_enough(size_t size)
244{
245 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
246}
247
248/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 380
249unsigned long hugetlb_total_pages(void) 381unsigned long hugetlb_total_pages(void)
250{ 382{
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
374 unsigned long address, pte_t *ptep, pte_t pte) 506 unsigned long address, pte_t *ptep, pte_t pte)
375{ 507{
376 struct page *old_page, *new_page; 508 struct page *old_page, *new_page;
377 int i, avoidcopy; 509 int avoidcopy;
378 510
379 old_page = pte_page(pte); 511 old_page = pte_page(pte);
380 512
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
395 } 527 }
396 528
397 spin_unlock(&mm->page_table_lock); 529 spin_unlock(&mm->page_table_lock);
398 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) 530 copy_huge_page(new_page, old_page, address);
399 copy_user_highpage(new_page + i, old_page + i,
400 address + i*PAGE_SIZE);
401 spin_lock(&mm->page_table_lock); 531 spin_lock(&mm->page_table_lock);
402 532
403 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 533 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
442 ret = VM_FAULT_OOM; 572 ret = VM_FAULT_OOM;
443 goto out; 573 goto out;
444 } 574 }
575 clear_huge_page(page, address);
445 576
446 if (vma->vm_flags & VM_SHARED) { 577 if (vma->vm_flags & VM_SHARED) {
447 int err; 578 int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 pte_t *ptep; 627 pte_t *ptep;
497 pte_t entry; 628 pte_t entry;
498 int ret; 629 int ret;
630 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
499 631
500 ptep = huge_pte_alloc(mm, address); 632 ptep = huge_pte_alloc(mm, address);
501 if (!ptep) 633 if (!ptep)
502 return VM_FAULT_OOM; 634 return VM_FAULT_OOM;
503 635
636 /*
637 * Serialize hugepage allocation and instantiation, so that we don't
638 * get spurious allocation failures if two CPUs race to instantiate
639 * the same page in the page cache.
640 */
641 mutex_lock(&hugetlb_instantiation_mutex);
504 entry = *ptep; 642 entry = *ptep;
505 if (pte_none(entry)) 643 if (pte_none(entry)) {
506 return hugetlb_no_page(mm, vma, address, ptep, write_access); 644 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
645 mutex_unlock(&hugetlb_instantiation_mutex);
646 return ret;
647 }
507 648
508 ret = VM_FAULT_MINOR; 649 ret = VM_FAULT_MINOR;
509 650
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
513 if (write_access && !pte_write(entry)) 654 if (write_access && !pte_write(entry))
514 ret = hugetlb_cow(mm, vma, address, ptep, entry); 655 ret = hugetlb_cow(mm, vma, address, ptep, entry);
515 spin_unlock(&mm->page_table_lock); 656 spin_unlock(&mm->page_table_lock);
657 mutex_unlock(&hugetlb_instantiation_mutex);
516 658
517 return ret; 659 return ret;
518} 660}
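
A userspace analogue of the serialization added here (illustration only, using POSIX threads): two threads racing to fill the same missing entry must not both allocate, so the check-and-fill step runs under one mutex, just as hugetlb_fault() now does with hugetlb_instantiation_mutex.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t instantiation_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *entry;				/* stand-in for a page table entry */

static void *fault(void *unused)
{
	pthread_mutex_lock(&instantiation_mutex);
	if (!entry)				/* "pte_none()" */
		entry = malloc(4096);		/* expensive allocation done exactly once */
	pthread_mutex_unlock(&instantiation_mutex);
	return entry;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, fault, NULL);
	pthread_create(&b, NULL, fault, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return entry == NULL;
}
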
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
521 struct page **pages, struct vm_area_struct **vmas, 663 struct page **pages, struct vm_area_struct **vmas,
522 unsigned long *position, int *length, int i) 664 unsigned long *position, int *length, int i)
523{ 665{
524 unsigned long vpfn, vaddr = *position; 666 unsigned long pfn_offset;
667 unsigned long vaddr = *position;
525 int remainder = *length; 668 int remainder = *length;
526 669
527 vpfn = vaddr/PAGE_SIZE;
528 spin_lock(&mm->page_table_lock); 670 spin_lock(&mm->page_table_lock);
529 while (vaddr < vma->vm_end && remainder) { 671 while (vaddr < vma->vm_end && remainder) {
530 pte_t *pte; 672 pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
552 break; 694 break;
553 } 695 }
554 696
555 if (pages) { 697 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
556 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 698 page = pte_page(*pte);
557 get_page(page); 699same_page:
558 pages[i] = page; 700 get_page(page);
559 } 701 if (pages)
702 pages[i] = page + pfn_offset;
560 703
561 if (vmas) 704 if (vmas)
562 vmas[i] = vma; 705 vmas[i] = vma;
563 706
564 vaddr += PAGE_SIZE; 707 vaddr += PAGE_SIZE;
565 ++vpfn; 708 ++pfn_offset;
566 --remainder; 709 --remainder;
567 ++i; 710 ++i;
711 if (vaddr < vma->vm_end && remainder &&
712 pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
713 /*
714 * We use pfn_offset to avoid touching the pageframes
715 * of this compound page.
716 */
717 goto same_page;
718 }
568 } 719 }
569 spin_unlock(&mm->page_table_lock); 720 spin_unlock(&mm->page_table_lock);
570 *length = remainder; 721 *length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
572 723
573 return i; 724 return i;
574} 725}
726
727void hugetlb_change_protection(struct vm_area_struct *vma,
728 unsigned long address, unsigned long end, pgprot_t newprot)
729{
730 struct mm_struct *mm = vma->vm_mm;
731 unsigned long start = address;
732 pte_t *ptep;
733 pte_t pte;
734
735 BUG_ON(address >= end);
736 flush_cache_range(vma, address, end);
737
738 spin_lock(&mm->page_table_lock);
739 for (; address < end; address += HPAGE_SIZE) {
740 ptep = huge_pte_offset(mm, address);
741 if (!ptep)
742 continue;
743 if (!pte_none(*ptep)) {
744 pte = huge_ptep_get_and_clear(mm, address, ptep);
745 pte = pte_mkhuge(pte_modify(pte, newprot));
746 set_huge_pte_at(mm, address, ptep, pte);
747 lazy_mmu_prot_update(pte);
748 }
749 }
750 spin_unlock(&mm->page_table_lock);
751
752 flush_tlb_range(vma, start, end);
753}
754
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H
11 13
12static inline void set_page_refs(struct page *page, int order) 14#include <linux/mm.h>
15
16static inline void set_page_count(struct page *page, int v)
17{
18 atomic_set(&page->_count, v);
19}
20
21/*
22 * Turn a non-refcounted page (->_count == 0) into refcounted with
23 * a count of one.
24 */
25static inline void set_page_refcounted(struct page *page)
13{ 26{
14#ifdef CONFIG_MMU 27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count));
15 set_page_count(page, 1); 29 set_page_count(page, 1);
16#else 30}
17 int i;
18 31
19 /* 32static inline void __put_page(struct page *page)
20 * We need to reference all the pages for this order, otherwise if 33{
21 * anyone accesses one of the pages with (get/put) it will be freed. 34 atomic_dec(&page->_count);
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27} 35}
28 36
29extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order); 38 unsigned int order);
39
40#endif
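
A userspace analogue (illustration only, C11 atomics) of the invariant set_page_refcounted() enforces: an object may only be handed out with a reference count of one if its count was exactly zero beforehand.

#include <assert.h>
#include <stdatomic.h>

struct obj {
	atomic_int count;
};

static void set_obj_refcounted(struct obj *o)
{
	assert(atomic_load(&o->count) == 0);	/* mirrors the BUG_ON() above */
	atomic_store(&o->count, 1);
}

int main(void)
{
	struct obj o = { .count = 0 };

	set_obj_refcounted(&o);
	return atomic_load(&o.count) != 1;
}
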
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..80c3fb370f91 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
277 anon_vma_unlink(vma); 277 anon_vma_unlink(vma);
278 unlink_file_vma(vma); 278 unlink_file_vma(vma);
279 279
280 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 280 if (is_vm_hugetlb_page(vma)) {
281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
282 floor, next? next->vm_start: ceiling); 282 floor, next? next->vm_start: ceiling);
283 } else { 283 } else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
285 * Optimization: gather nearby vmas into one call down 285 * Optimization: gather nearby vmas into one call down
286 */ 286 */
287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
288 && !is_hugepage_only_range(vma->vm_mm, next->vm_start, 288 && !is_vm_hugetlb_page(next)) {
289 HPAGE_SIZE)) {
290 vma = next; 289 vma = next;
291 next = vma->vm_next; 290 next = vma->vm_next;
292 anon_vma_unlink(vma); 291 anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
388{ 387{
389 unsigned long pfn = pte_pfn(pte); 388 unsigned long pfn = pte_pfn(pte);
390 389
391 if (vma->vm_flags & VM_PFNMAP) { 390 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
392 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 391 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
393 if (pfn == vma->vm_pgoff + off) 392 if (pfn == vma->vm_pgoff + off)
394 return NULL; 393 return NULL;
@@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
396 return NULL; 395 return NULL;
397 } 396 }
398 397
399 /* 398#ifdef CONFIG_DEBUG_VM
400 * Add some anal sanity checks for now. Eventually,
401 * we should just do "return pfn_to_page(pfn)", but
402 * in the meantime we check that we get a valid pfn,
403 * and that the resulting page looks ok.
404 *
405 * Remove this test eventually!
406 */
407 if (unlikely(!pfn_valid(pfn))) { 399 if (unlikely(!pfn_valid(pfn))) {
408 print_bad_pte(vma, pte, addr); 400 print_bad_pte(vma, pte, addr);
409 return NULL; 401 return NULL;
410 } 402 }
403#endif
411 404
412 /* 405 /*
413 * NOTE! We still have PageReserved() pages in the page 406 * NOTE! We still have PageReserved() pages in the page
@@ -1221,9 +1214,7 @@ out:
1221 * The page has to be a nice clean _individual_ kernel allocation. 1214 * The page has to be a nice clean _individual_ kernel allocation.
1222 * If you allocate a compound page, you need to have marked it as 1215 * If you allocate a compound page, you need to have marked it as
1223 * such (__GFP_COMP), or manually just split the page up yourself 1216 * such (__GFP_COMP), or manually just split the page up yourself
1224 * (which is mainly an issue of doing "set_page_count(page, 1)" for 1217 * (see split_page()).
1225 * each sub-page, and then freeing them one by one when you free
1226 * them rather than freeing it as a compound page).
1227 * 1218 *
1228 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1219 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1229 * took an arbitrary page protection parameter. This doesn't allow 1220 * took an arbitrary page protection parameter. This doesn't allow
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0b..e93cc740c22b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
86#include <linux/swap.h> 86#include <linux/swap.h>
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
89 90
90#include <asm/tlbflush.h> 91#include <asm/tlbflush.h>
91#include <asm/uaccess.h> 92#include <asm/uaccess.h>
@@ -95,11 +96,8 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 96#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 97#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 98
98/* The number of pages to migrate per call to migrate_pages() */ 99static struct kmem_cache *policy_cache;
99#define MIGRATE_CHUNK_SIZE 256 100static struct kmem_cache *sn_cache;
100
101static kmem_cache_t *policy_cache;
102static kmem_cache_t *sn_cache;
103 101
104#define PDprintk(fmt...) 102#define PDprintk(fmt...)
105 103
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 struct vm_area_struct *first, *vma, *prev; 329 struct vm_area_struct *first, *vma, *prev;
332 330
333 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 331 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 /* Must have swap device for migration */
335 if (nr_swap_pages <= 0)
336 return ERR_PTR(-ENODEV);
337 332
338 /* 333 err = migrate_prep();
339 * Clear the LRU lists so pages can be isolated. 334 if (err)
340 * Note that pages may be moved off the LRU after we have 335 return ERR_PTR(err);
341 * drained them. Those pages will fail to migrate like other
342 * pages that may be busy.
343 */
344 lru_add_drain_all();
345 } 336 }
346 337
347 first = find_vma(mm, start); 338 first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
550 return err; 541 return err;
551} 542}
552 543
544#ifdef CONFIG_MIGRATION
553/* 545/*
554 * page migration 546 * page migration
555 */ 547 */
556
557static void migrate_page_add(struct page *page, struct list_head *pagelist, 548static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 unsigned long flags) 549 unsigned long flags)
559{ 550{
560 /* 551 /*
561 * Avoid migrating a page that is shared with others. 552 * Avoid migrating a page that is shared with others.
562 */ 553 */
563 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 554 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
564 if (isolate_lru_page(page)) 555 isolate_lru_page(page, pagelist);
565 list_add_tail(&page->lru, pagelist);
566 }
567}
568
569/*
570 * Migrate the list 'pagelist' of pages to a certain destination.
571 *
572 * Specify destination with either non-NULL vma or dest_node >= 0
573 * Return the number of pages not migrated or error code
574 */
575static int migrate_pages_to(struct list_head *pagelist,
576 struct vm_area_struct *vma, int dest)
577{
578 LIST_HEAD(newlist);
579 LIST_HEAD(moved);
580 LIST_HEAD(failed);
581 int err = 0;
582 unsigned long offset = 0;
583 int nr_pages;
584 struct page *page;
585 struct list_head *p;
586
587redo:
588 nr_pages = 0;
589 list_for_each(p, pagelist) {
590 if (vma) {
591 /*
592 * The address passed to alloc_page_vma is used to
593 * generate the proper interleave behavior. We fake
594 * the address here by an increasing offset in order
595 * to get the proper distribution of pages.
596 *
597 * No decision has been made as to which page
598 * a certain old page is moved to so we cannot
599 * specify the correct address.
600 */
601 page = alloc_page_vma(GFP_HIGHUSER, vma,
602 offset + vma->vm_start);
603 offset += PAGE_SIZE;
604 }
605 else
606 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607
608 if (!page) {
609 err = -ENOMEM;
610 goto out;
611 }
612 list_add_tail(&page->lru, &newlist);
613 nr_pages++;
614 if (nr_pages > MIGRATE_CHUNK_SIZE)
615 break;
616 }
617 err = migrate_pages(pagelist, &newlist, &moved, &failed);
618
619 putback_lru_pages(&moved); /* Call release pages instead ?? */
620
621 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 goto redo;
623out:
624 /* Return leftover allocated pages */
625 while (!list_empty(&newlist)) {
626 page = list_entry(newlist.next, struct page, lru);
627 list_del(&page->lru);
628 __free_page(page);
629 }
630 list_splice(&failed, pagelist);
631 if (err < 0)
632 return err;
633
634 /* Calculate number of leftover pages */
635 nr_pages = 0;
636 list_for_each(p, pagelist)
637 nr_pages++;
638 return nr_pages;
639} 556}
640 557
641/* 558/*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
742 if (err < 0) 659 if (err < 0)
743 return err; 660 return err;
744 return busy; 661 return busy;
662
745} 663}
746 664
665#else
666
667static void migrate_page_add(struct page *page, struct list_head *pagelist,
668 unsigned long flags)
669{
670}
671
672int do_migrate_pages(struct mm_struct *mm,
673 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
674{
675 return -ENOSYS;
676}
677#endif
678
747long do_mbind(unsigned long start, unsigned long len, 679long do_mbind(unsigned long start, unsigned long len,
748 unsigned long mode, nodemask_t *nmask, unsigned long flags) 680 unsigned long mode, nodemask_t *nmask, unsigned long flags)
749{ 681{
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
808 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 740 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 err = -EIO; 741 err = -EIO;
810 } 742 }
743
811 if (!list_empty(&pagelist)) 744 if (!list_empty(&pagelist))
812 putback_lru_pages(&pagelist); 745 putback_lru_pages(&pagelist);
813 746
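
A hypothetical caller (not from this patch) showing the calling convention mempolicy now follows with the migration core split out: prepare, isolate onto a private list, migrate, then put anything left over back on the LRU.

#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/mm.h>

static int move_one_page(struct page *page, struct vm_area_struct *vma, int node)
{
	LIST_HEAD(pagelist);
	int err;

	err = migrate_prep();		/* checks for swap, drains LRU pagevecs */
	if (err)
		return err;

	if (isolate_lru_page(page, &pagelist))
		return -EBUSY;		/* page was not on the LRU */

	/* pass a vma for interleave placement, or a target node in 'node' */
	err = migrate_pages_to(&pagelist, vma, node);
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);	/* return unmigrated pages to the LRU */
	return err;
}
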
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d3..f71893ed3543 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free);
278 */ 278 */
279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
280{ 280{
281 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 281 struct kmem_cache *mem = pool_data;
282 return kmem_cache_alloc(mem, gfp_mask); 282 return kmem_cache_alloc(mem, gfp_mask);
283} 283}
284EXPORT_SYMBOL(mempool_alloc_slab); 284EXPORT_SYMBOL(mempool_alloc_slab);
285 285
286void mempool_free_slab(void *element, void *pool_data) 286void mempool_free_slab(void *element, void *pool_data)
287{ 287{
288 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 288 struct kmem_cache *mem = pool_data;
289 kmem_cache_free(mem, element); 289 kmem_cache_free(mem, element);
290} 290}
291EXPORT_SYMBOL(mempool_free_slab); 291EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com>
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/buffer_head.h> /* for try_to_release_page(),
20 buffer_heads_over_limit */
21#include <linux/mm_inline.h>
22#include <linux/pagevec.h>
23#include <linux/rmap.h>
24#include <linux/topology.h>
25#include <linux/cpu.h>
26#include <linux/cpuset.h>
27#include <linux/swapops.h>
28
29#include "internal.h"
30
32
33/* The maximum number of pages to take off the LRU for migration */
34#define MIGRATE_CHUNK_SIZE 256
35
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37
38/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page)) {
55 ret = 0;
56 get_page(page);
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called after we have compiled the list of pages
71 * to be migrated using isolate_lru_page() but before we begin a series of calls
72 * to migrate_pages().
73 */
74int migrate_prep(void)
75{
76 /* Must have swap device for migration */
77 if (nr_swap_pages <= 0)
78 return -ENODEV;
79
80 /*
81 * Clear the LRU lists so pages can be isolated.
82 * Note that pages may be moved off the LRU after we have
83 * drained them. Those pages will fail to migrate like other
84 * pages that may be busy.
85 */
86 lru_add_drain_all();
87
88 return 0;
89}
90
91static inline void move_to_lru(struct page *page)
92{
93 list_del(&page->lru);
94 if (PageActive(page)) {
95 /*
96 * lru_cache_add_active checks that
97 * the PG_active bit is off.
98 */
99 ClearPageActive(page);
100 lru_cache_add_active(page);
101 } else {
102 lru_cache_add(page);
103 }
104 put_page(page);
105}
106
107/*
108 * Add isolated pages on the list back to the LRU.
109 *
110 * returns the number of pages put back.
111 */
112int putback_lru_pages(struct list_head *l)
113{
114 struct page *page;
115 struct page *page2;
116 int count = 0;
117
118 list_for_each_entry_safe(page, page2, l, lru) {
119 move_to_lru(page);
120 count++;
121 }
122 return count;
123}
124
125/*
126 * Non migratable page
127 */
128int fail_migrate_page(struct page *newpage, struct page *page)
129{
130 return -EIO;
131}
132EXPORT_SYMBOL(fail_migrate_page);
133
134/*
135 * swapout a single page
136 * page is locked upon entry, unlocked on exit
137 */
138static int swap_page(struct page *page)
139{
140 struct address_space *mapping = page_mapping(page);
141
142 if (page_mapped(page) && mapping)
143 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
144 goto unlock_retry;
145
146 if (PageDirty(page)) {
147 /* Page is dirty, try to write it out here */
148 switch(pageout(page, mapping)) {
149 case PAGE_KEEP:
150 case PAGE_ACTIVATE:
151 goto unlock_retry;
152
153 case PAGE_SUCCESS:
154 goto retry;
155
156 case PAGE_CLEAN:
157 ; /* try to free the page below */
158 }
159 }
160
161 if (PagePrivate(page)) {
162 if (!try_to_release_page(page, GFP_KERNEL) ||
163 (!mapping && page_count(page) == 1))
164 goto unlock_retry;
165 }
166
167 if (remove_mapping(mapping, page)) {
168 /* Success */
169 unlock_page(page);
170 return 0;
171 }
172
173unlock_retry:
174 unlock_page(page);
175
176retry:
177 return -EAGAIN;
178}
179EXPORT_SYMBOL(swap_page);
180
181/*
182 * Remove references for a page and establish the new page with the correct
183 * basic settings to be able to stop accesses to the page.
184 */
185int migrate_page_remove_references(struct page *newpage,
186 struct page *page, int nr_refs)
187{
188 struct address_space *mapping = page_mapping(page);
189 struct page **radix_pointer;
190
191 /*
192 * Avoid doing any of the following work if the page count
193 * indicates that the page is in use or truncate has removed
194 * the page.
195 */
196 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
197 return -EAGAIN;
198
199 /*
200 * Establish swap ptes for anonymous pages or destroy pte
201 * maps for files.
202 *
203 * In order to reestablish file backed mappings the fault handlers
204 * will take the radix tree_lock which may then be used to stop
 205 * processes from accessing this page until the new page is ready.
206 *
207 * A process accessing via a swap pte (an anonymous page) will take a
208 * page_lock on the old page which will block the process until the
209 * migration attempt is complete. At that time the PageSwapCache bit
210 * will be examined. If the page was migrated then the PageSwapCache
211 * bit will be clear and the operation to retrieve the page will be
212 * retried which will find the new page in the radix tree. Then a new
213 * direct mapping may be generated based on the radix tree contents.
214 *
215 * If the page was not migrated then the PageSwapCache bit
216 * is still set and the operation may continue.
217 */
218 if (try_to_unmap(page, 1) == SWAP_FAIL)
219 /* A vma has VM_LOCKED set -> permanent failure */
220 return -EPERM;
221
222 /*
223 * Give up if we were unable to remove all mappings.
224 */
225 if (page_mapcount(page))
226 return -EAGAIN;
227
228 write_lock_irq(&mapping->tree_lock);
229
230 radix_pointer = (struct page **)radix_tree_lookup_slot(
231 &mapping->page_tree,
232 page_index(page));
233
234 if (!page_mapping(page) || page_count(page) != nr_refs ||
235 *radix_pointer != page) {
236 write_unlock_irq(&mapping->tree_lock);
237 return 1;
238 }
239
240 /*
241 * Now we know that no one else is looking at the page.
242 *
243 * Certain minimal information about a page must be available
244 * in order for other subsystems to properly handle the page if they
245 * find it through the radix tree update before we are finished
246 * copying the page.
247 */
248 get_page(newpage);
249 newpage->index = page->index;
250 newpage->mapping = page->mapping;
251 if (PageSwapCache(page)) {
252 SetPageSwapCache(newpage);
253 set_page_private(newpage, page_private(page));
254 }
255
256 *radix_pointer = newpage;
257 __put_page(page);
258 write_unlock_irq(&mapping->tree_lock);
259
260 return 0;
261}
262EXPORT_SYMBOL(migrate_page_remove_references);
263
264/*
265 * Copy the page to its new location
266 */
267void migrate_page_copy(struct page *newpage, struct page *page)
268{
269 copy_highpage(newpage, page);
270
271 if (PageError(page))
272 SetPageError(newpage);
273 if (PageReferenced(page))
274 SetPageReferenced(newpage);
275 if (PageUptodate(page))
276 SetPageUptodate(newpage);
277 if (PageActive(page))
278 SetPageActive(newpage);
279 if (PageChecked(page))
280 SetPageChecked(newpage);
281 if (PageMappedToDisk(page))
282 SetPageMappedToDisk(newpage);
283
284 if (PageDirty(page)) {
285 clear_page_dirty_for_io(page);
286 set_page_dirty(newpage);
287 }
288
289 ClearPageSwapCache(page);
290 ClearPageActive(page);
291 ClearPagePrivate(page);
292 set_page_private(page, 0);
293 page->mapping = NULL;
294
295 /*
296 * If any waiters have accumulated on the new page then
297 * wake them up.
298 */
299 if (PageWriteback(newpage))
300 end_page_writeback(newpage);
301}
302EXPORT_SYMBOL(migrate_page_copy);
303
304/*
305 * Common logic to directly migrate a single page suitable for
306 * pages that do not use PagePrivate.
307 *
308 * Pages are locked upon entry and exit.
309 */
310int migrate_page(struct page *newpage, struct page *page)
311{
312 int rc;
313
314 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
315
316 rc = migrate_page_remove_references(newpage, page, 2);
317
318 if (rc)
319 return rc;
320
321 migrate_page_copy(newpage, page);
322
323 /*
324 * Remove auxiliary swap entries and replace
325 * them with real ptes.
326 *
327 * Note that a real pte entry will allow processes that are not
328 * waiting on the page lock to use the new page via the page tables
329 * before the new page is unlocked.
330 */
331 remove_from_swap(newpage);
332 return 0;
333}
334EXPORT_SYMBOL(migrate_page);
335
336/*
337 * migrate_pages
338 *
339 * Two lists are passed to this function. The first list
340 * contains the pages isolated from the LRU to be migrated.
341 * The second list contains new pages that the pages isolated
342 * can be moved to. If the second list is NULL then all
343 * pages are swapped out.
344 *
345 * The function returns after 10 attempts or if no pages
 346 * are movable anymore because the 'to' list has become empty
347 * or no retryable pages exist anymore.
348 *
349 * Return: Number of pages not migrated when "to" ran empty.
350 */
351int migrate_pages(struct list_head *from, struct list_head *to,
352 struct list_head *moved, struct list_head *failed)
353{
354 int retry;
355 int nr_failed = 0;
356 int pass = 0;
357 struct page *page;
358 struct page *page2;
359 int swapwrite = current->flags & PF_SWAPWRITE;
360 int rc;
361
362 if (!swapwrite)
363 current->flags |= PF_SWAPWRITE;
364
365redo:
366 retry = 0;
367
368 list_for_each_entry_safe(page, page2, from, lru) {
369 struct page *newpage = NULL;
370 struct address_space *mapping;
371
372 cond_resched();
373
374 rc = 0;
375 if (page_count(page) == 1)
376 /* page was freed from under us. So we are done. */
377 goto next;
378
379 if (to && list_empty(to))
380 break;
381
382 /*
383 * Skip locked pages during the first two passes to give the
384 * functions holding the lock time to release the page. Later we
385 * use lock_page() to have a higher chance of acquiring the
386 * lock.
387 */
388 rc = -EAGAIN;
389 if (pass > 2)
390 lock_page(page);
391 else
392 if (TestSetPageLocked(page))
393 goto next;
394
395 /*
396 * Only wait on writeback if we have already done a pass where
 396 * we may have triggered writeouts for lots of pages.
398 */
399 if (pass > 0) {
400 wait_on_page_writeback(page);
401 } else {
402 if (PageWriteback(page))
403 goto unlock_page;
404 }
405
406 /*
407 * Anonymous pages must have swap cache references otherwise
408 * the information contained in the page maps cannot be
409 * preserved.
410 */
411 if (PageAnon(page) && !PageSwapCache(page)) {
412 if (!add_to_swap(page, GFP_KERNEL)) {
413 rc = -ENOMEM;
414 goto unlock_page;
415 }
416 }
417
418 if (!to) {
419 rc = swap_page(page);
420 goto next;
421 }
422
423 newpage = lru_to_page(to);
424 lock_page(newpage);
425
426 /*
427 * Pages are properly locked and writeback is complete.
428 * Try to migrate the page.
429 */
430 mapping = page_mapping(page);
431 if (!mapping)
432 goto unlock_both;
433
434 if (mapping->a_ops->migratepage) {
435 /*
436 * Most pages have a mapping and most filesystems
437 * should provide a migration function. Anonymous
438 * pages are part of swap space which also has its
439 * own migration function. This is the most common
440 * path for page migration.
441 */
442 rc = mapping->a_ops->migratepage(newpage, page);
443 goto unlock_both;
444 }
445
446 /*
447 * Default handling if a filesystem does not provide
448 * a migration function. We can only migrate clean
449 * pages so try to write out any dirty pages first.
450 */
451 if (PageDirty(page)) {
452 switch (pageout(page, mapping)) {
453 case PAGE_KEEP:
454 case PAGE_ACTIVATE:
455 goto unlock_both;
456
457 case PAGE_SUCCESS:
458 unlock_page(newpage);
459 goto next;
460
461 case PAGE_CLEAN:
462 ; /* try to migrate the page below */
463 }
464 }
465
466 /*
467 * Buffers are managed in a filesystem specific way.
468 * We must have no buffers or drop them.
469 */
470 if (!page_has_buffers(page) ||
471 try_to_release_page(page, GFP_KERNEL)) {
472 rc = migrate_page(newpage, page);
473 goto unlock_both;
474 }
475
476 /*
477 * On early passes with mapped pages simply
478 * retry. There may be a lock held for some
479 * buffers that may go away. Later
480 * swap them out.
481 */
482 if (pass > 4) {
483 /*
484 * Persistently unable to drop buffers..... As a
485 * measure of last resort we fall back to
486 * swap_page().
487 */
488 unlock_page(newpage);
489 newpage = NULL;
490 rc = swap_page(page);
491 goto next;
492 }
493
494unlock_both:
495 unlock_page(newpage);
496
497unlock_page:
498 unlock_page(page);
499
500next:
501 if (rc == -EAGAIN) {
502 retry++;
503 } else if (rc) {
504 /* Permanent failure */
505 list_move(&page->lru, failed);
506 nr_failed++;
507 } else {
508 if (newpage) {
509 /* Successful migration. Return page to LRU */
510 move_to_lru(newpage);
511 }
512 list_move(&page->lru, moved);
513 }
514 }
515 if (retry && pass++ < 10)
516 goto redo;
517
518 if (!swapwrite)
519 current->flags &= ~PF_SWAPWRITE;
520
521 return nr_failed + retry;
522}
523
524/*
525 * Migration function for pages with buffers. This function can only be used
526 * if the underlying filesystem guarantees that no other references to "page"
527 * exist.
528 */
529int buffer_migrate_page(struct page *newpage, struct page *page)
530{
531 struct address_space *mapping = page->mapping;
532 struct buffer_head *bh, *head;
533 int rc;
534
535 if (!mapping)
536 return -EAGAIN;
537
538 if (!page_has_buffers(page))
539 return migrate_page(newpage, page);
540
541 head = page_buffers(page);
542
543 rc = migrate_page_remove_references(newpage, page, 3);
544
545 if (rc)
546 return rc;
547
548 bh = head;
549 do {
550 get_bh(bh);
551 lock_buffer(bh);
552 bh = bh->b_this_page;
553
554 } while (bh != head);
555
556 ClearPagePrivate(page);
557 set_page_private(newpage, page_private(page));
558 set_page_private(page, 0);
559 put_page(page);
560 get_page(newpage);
561
562 bh = head;
563 do {
564 set_bh_page(bh, newpage, bh_offset(bh));
565 bh = bh->b_this_page;
566
567 } while (bh != head);
568
569 SetPagePrivate(newpage);
570
571 migrate_page_copy(newpage, page);
572
573 bh = head;
574 do {
575 unlock_buffer(bh);
576 put_bh(bh);
577 bh = bh->b_this_page;
578
579 } while (bh != head);
580
581 return 0;
582}
583EXPORT_SYMBOL(buffer_migrate_page);
584
585/*
586 * Migrate the list 'pagelist' of pages to a certain destination.
587 *
588 * Specify destination with either non-NULL vma or dest_node >= 0
589 * Return the number of pages not migrated or error code
590 */
591int migrate_pages_to(struct list_head *pagelist,
592 struct vm_area_struct *vma, int dest)
593{
594 LIST_HEAD(newlist);
595 LIST_HEAD(moved);
596 LIST_HEAD(failed);
597 int err = 0;
598 unsigned long offset = 0;
599 int nr_pages;
600 struct page *page;
601 struct list_head *p;
602
603redo:
604 nr_pages = 0;
605 list_for_each(p, pagelist) {
606 if (vma) {
607 /*
608 * The address passed to alloc_page_vma is used to
609 * generate the proper interleave behavior. We fake
610 * the address here by an increasing offset in order
611 * to get the proper distribution of pages.
612 *
613 * No decision has been made as to which page
614 * a certain old page is moved to so we cannot
615 * specify the correct address.
616 */
617 page = alloc_page_vma(GFP_HIGHUSER, vma,
618 offset + vma->vm_start);
619 offset += PAGE_SIZE;
620 }
621 else
622 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
623
624 if (!page) {
625 err = -ENOMEM;
626 goto out;
627 }
628 list_add_tail(&page->lru, &newlist);
629 nr_pages++;
630 if (nr_pages > MIGRATE_CHUNK_SIZE)
631 break;
632 }
633 err = migrate_pages(pagelist, &newlist, &moved, &failed);
634
635 putback_lru_pages(&moved); /* Call release pages instead ?? */
636
637 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
638 goto redo;
639out:
640 /* Return leftover allocated pages */
641 while (!list_empty(&newlist)) {
642 page = list_entry(newlist.next, struct page, lru);
643 list_del(&page->lru);
644 __free_page(page);
645 }
646 list_splice(&failed, pagelist);
647 if (err < 0)
648 return err;
649
650 /* Calculate number of leftover pages */
651 nr_pages = 0;
652 list_for_each(p, pagelist)
653 nr_pages++;
654 return nr_pages;
655}
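
A hypothetical filesystem hookup (not part of this patch): a filesystem whose pages carry no private state can opt into migration by pointing its address_space_operations at the generic helper; filesystems built on buffer heads would use buffer_migrate_page() instead.

#include <linux/fs.h>
#include <linux/migrate.h>

static struct address_space_operations examplefs_aops = {
	/* other methods (readpage, writepage, ...) omitted for brevity */
	.migratepage	= migrate_page,
};
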
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e90..0eb9894db6de 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
612 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
613 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
614 */ 614 */
615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) 615#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
616 616
617static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
618 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
845 const unsigned long stack_flags 845 const unsigned long stack_flags
846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
847 847
848#ifdef CONFIG_HUGETLB
849 if (flags & VM_HUGETLB) {
850 if (!(flags & VM_DONTCOPY))
851 mm->shared_vm += pages;
852 return;
853 }
854#endif /* CONFIG_HUGETLB */
855
856 if (file) { 848 if (file) {
857 mm->shared_vm += pages; 849 mm->shared_vm += pages;
858 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 850 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1ed..4c14d4289b61 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
128 charged = nrpages; 128 charged = nrpages;
129 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
130 return -ENOMEM; 130 return -ENOMEM;
@@ -166,7 +166,10 @@ success:
166 */ 166 */
167 vma->vm_flags = newflags; 167 vma->vm_flags = newflags;
168 vma->vm_page_prot = newprot; 168 vma->vm_page_prot = newprot;
169 change_protection(vma, start, end, newprot); 169 if (is_vm_hugetlb_page(vma))
170 hugetlb_change_protection(vma, start, end, newprot);
171 else
172 change_protection(vma, start, end, newprot);
170 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 173 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
171 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 174 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
172 return 0; 175 return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
240 243
241 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 244 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
242 245
243 if (is_vm_hugetlb_page(vma)) {
244 error = -EACCES;
245 goto out;
246 }
247
248 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 246 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
249 247
250 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 248 /* newflags >> 4 shift VM_MAY% in place of VM_% */
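
A userspace sketch of the visible change (illustration only; the hugetlbfs mount point and file name are assumptions): mprotect() on a hugetlb mapping used to fail with EACCES and is now applied hugepage by hugepage via hugetlb_change_protection().

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

#define MAP_LEN		(4UL * 1024 * 1024)	/* two 2 MB huge pages, assumed */

int main(void)
{
	void *p;
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	if (mprotect(p, MAP_LEN, PROT_READ))
		perror("mprotect");	/* EACCES on kernels without this change */
	else
		printf("hugetlb mapping is now read-only\n");
	munmap(p, MAP_LEN);
	close(fd);
	return 0;
}
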
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f28..db45efac17cc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
159 /* 159 /*
160 * kmalloc doesn't like __GFP_HIGHMEM for some reason 160 * kmalloc doesn't like __GFP_HIGHMEM for some reason
161 */ 161 */
162 return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); 162 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
163} 163}
164 164
165struct page * vmalloc_to_page(void *addr) 165struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
623 * - note that this may not return a page-aligned address if the object 623 * - note that this may not return a page-aligned address if the object
624 * we're allocating is smaller than a page 624 * we're allocating is smaller than a page
625 */ 625 */
626 base = kmalloc(len, GFP_KERNEL); 626 base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
627 if (!base) 627 if (!base)
628 goto enomem; 628 goto enomem;
629 629
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d14..b7f14a4799a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction; 56int percpu_pagelist_fraction;
57 57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
59static void __free_pages_ok(struct page *page, unsigned int order); 58static void __free_pages_ok(struct page *page, unsigned int order);
60 59
61/* 60/*
@@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
190 for (i = 0; i < nr_pages; i++) { 189 for (i = 0; i < nr_pages; i++) {
191 struct page *p = page + i; 190 struct page *p = page + i;
192 191
193 SetPageCompound(p); 192 __SetPageCompound(p);
194 set_page_private(p, (unsigned long)page); 193 set_page_private(p, (unsigned long)page);
195 } 194 }
196} 195}
@@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
209 if (unlikely(!PageCompound(p) | 208 if (unlikely(!PageCompound(p) |
210 (page_private(p) != (unsigned long)page))) 209 (page_private(p) != (unsigned long)page)))
211 bad_page(page); 210 bad_page(page);
212 ClearPageCompound(p); 211 __ClearPageCompound(p);
213 } 212 }
214} 213}
215 214
215static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
216{
217 int i;
218
219 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
220 /*
221 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
222 * and __GFP_HIGHMEM from hard or soft interrupt context.
223 */
224 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
225 for (i = 0; i < (1 << order); i++)
226 clear_highpage(page + i);
227}
228
216/* 229/*
217 * function for dealing with page's order in buddy system. 230 * function for dealing with page's order in buddy system.
218 * zone->lock is already acquired when we use these. 231 * zone->lock is already acquired when we use these.
@@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
423 mutex_debug_check_no_locks_freed(page_address(page), 436 mutex_debug_check_no_locks_freed(page_address(page),
424 PAGE_SIZE<<order); 437 PAGE_SIZE<<order);
425 438
426#ifndef CONFIG_MMU
427 for (i = 1 ; i < (1 << order) ; ++i)
428 __put_page(page + i);
429#endif
430
431 for (i = 0 ; i < (1 << order) ; ++i) 439 for (i = 0 ; i < (1 << order) ; ++i)
432 reserved += free_pages_check(page + i); 440 reserved += free_pages_check(page + i);
433 if (reserved) 441 if (reserved)
@@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
448 if (order == 0) { 456 if (order == 0) {
449 __ClearPageReserved(page); 457 __ClearPageReserved(page);
450 set_page_count(page, 0); 458 set_page_count(page, 0);
451 459 set_page_refcounted(page);
452 free_hot_cold_page(page, 0); 460 __free_page(page);
453 } else { 461 } else {
454 LIST_HEAD(list);
455 int loop; 462 int loop;
456 463
464 prefetchw(page);
457 for (loop = 0; loop < BITS_PER_LONG; loop++) { 465 for (loop = 0; loop < BITS_PER_LONG; loop++) {
458 struct page *p = &page[loop]; 466 struct page *p = &page[loop];
459 467
460 if (loop + 16 < BITS_PER_LONG) 468 if (loop + 1 < BITS_PER_LONG)
461 prefetchw(p + 16); 469 prefetchw(p + 1);
462 __ClearPageReserved(p); 470 __ClearPageReserved(p);
463 set_page_count(p, 0); 471 set_page_count(p, 0);
464 } 472 }
465 473
466 arch_free_page(page, order); 474 set_page_refcounted(page);
467 475 __free_pages(page, order);
468 mod_page_state(pgfree, 1 << order);
469
470 list_add(&page->lru, &list);
471 kernel_map_pages(page, 1 << order, 0);
472 free_pages_bulk(page_zone(page), 1, &list, order);
473 } 476 }
474} 477}
475 478
@@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page,
507/* 510/*
508 * This page is about to be returned from the page allocator 511 * This page is about to be returned from the page allocator
509 */ 512 */
510static int prep_new_page(struct page *page, int order) 513static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
511{ 514{
512 if (unlikely(page_mapcount(page) | 515 if (unlikely(page_mapcount(page) |
513 (page->mapping != NULL) | 516 (page->mapping != NULL) |
@@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order)
536 1 << PG_referenced | 1 << PG_arch_1 | 539 1 << PG_referenced | 1 << PG_arch_1 |
537 1 << PG_checked | 1 << PG_mappedtodisk); 540 1 << PG_checked | 1 << PG_mappedtodisk);
538 set_page_private(page, 0); 541 set_page_private(page, 0);
539 set_page_refs(page, order); 542 set_page_refcounted(page);
540 kernel_map_pages(page, 1 << order, 1); 543 kernel_map_pages(page, 1 << order, 1);
544
545 if (gfp_flags & __GFP_ZERO)
546 prep_zero_page(page, order, gfp_flags);
547
548 if (order && (gfp_flags & __GFP_COMP))
549 prep_compound_page(page, order);
550
541 return 0; 551 return 0;
542} 552}
543 553
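prep_new_page() now takes the gfp mask and applies __GFP_ZERO and __GFP_COMP itself, so every path that hands pages out of the buddy allocator honours them. A caller-side sketch of what that buys; this is a generic illustration, not code from the patch.

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_grab_block(void)
{
        /*
         * Order-1 block, pre-zeroed by prep_zero_page() and marked
         * compound by prep_compound_page().  Note the restriction in
         * the hunk above: __GFP_ZERO together with __GFP_HIGHMEM must
         * not be requested from interrupt context, because
         * clear_highpage() uses KM_USER0.
         */
        return alloc_pages(GFP_KERNEL | __GFP_ZERO | __GFP_COMP, 1);
}

static void demo_drop_block(struct page *page)
{
        if (page)
                __free_pages(page, 1);
}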
@@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
593/* 603/*
594 * Called from the slab reaper to drain pagesets on a particular node that 604 * Called from the slab reaper to drain pagesets on a particular node that
595 * belong to the currently executing processor. 605 * belong to the currently executing processor.
606 * Note that this function must be called with the thread pinned to
607 * a single processor.
596 */ 608 */
597void drain_node_pages(int nodeid) 609void drain_node_pages(int nodeid)
598{ 610{
599 int i, z; 611 int i, z;
600 unsigned long flags; 612 unsigned long flags;
601 613
602 local_irq_save(flags);
603 for (z = 0; z < MAX_NR_ZONES; z++) { 614 for (z = 0; z < MAX_NR_ZONES; z++) {
604 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 615 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
605 struct per_cpu_pageset *pset; 616 struct per_cpu_pageset *pset;
@@ -609,11 +620,14 @@ void drain_node_pages(int nodeid)
609 struct per_cpu_pages *pcp; 620 struct per_cpu_pages *pcp;
610 621
611 pcp = &pset->pcp[i]; 622 pcp = &pset->pcp[i];
612 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 623 if (pcp->count) {
613 pcp->count = 0; 624 local_irq_save(flags);
625 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
626 pcp->count = 0;
627 local_irq_restore(flags);
628 }
614 } 629 }
615 } 630 }
616 local_irq_restore(flags);
617} 631}
618#endif 632#endif
619 633
@@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page)
743 free_hot_cold_page(page, 1); 757 free_hot_cold_page(page, 1);
744} 758}
745 759
746static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 760/*
761 * split_page takes a non-compound higher-order page, and splits it into
762 * n (1<<order) sub-pages: page[0..n]
763 * Each sub-page must be freed individually.
764 *
765 * Note: this is probably too low level an operation for use in drivers.
766 * Please consult with lkml before using this in your driver.
767 */
768void split_page(struct page *page, unsigned int order)
747{ 769{
748 int i; 770 int i;
749 771
750 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 772 BUG_ON(PageCompound(page));
751 for(i = 0; i < (1 << order); i++) 773 BUG_ON(!page_count(page));
752 clear_highpage(page + i); 774 for (i = 1; i < (1 << order); i++)
775 set_page_refcounted(page + i);
753} 776}
754 777
755/* 778/*
@@ -795,14 +818,8 @@ again:
795 put_cpu(); 818 put_cpu();
796 819
797 BUG_ON(bad_range(zone, page)); 820 BUG_ON(bad_range(zone, page));
798 if (prep_new_page(page, order)) 821 if (prep_new_page(page, order, gfp_flags))
799 goto again; 822 goto again;
800
801 if (gfp_flags & __GFP_ZERO)
802 prep_zero_page(page, order, gfp_flags);
803
804 if (order && (gfp_flags & __GFP_COMP))
805 prep_compound_page(page, order);
806 return page; 823 return page;
807 824
808failed: 825failed:
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1214 1231
1215static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1232static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1216{ 1233{
1217 int cpu = 0; 1234 unsigned cpu;
1218 1235
1219 memset(ret, 0, nr * sizeof(unsigned long)); 1236 memset(ret, 0, nr * sizeof(unsigned long));
1220 cpus_and(*cpumask, *cpumask, cpu_online_map); 1237 cpus_and(*cpumask, *cpumask, cpu_online_map);
1221 1238
1222 cpu = first_cpu(*cpumask); 1239 for_each_cpu_mask(cpu, *cpumask) {
1223 while (cpu < NR_CPUS) { 1240 unsigned long *in;
1224 unsigned long *in, *out, off; 1241 unsigned long *out;
1225 1242 unsigned off;
1226 if (!cpu_isset(cpu, *cpumask)) 1243 unsigned next_cpu;
1227 continue;
1228 1244
1229 in = (unsigned long *)&per_cpu(page_states, cpu); 1245 in = (unsigned long *)&per_cpu(page_states, cpu);
1230 1246
1231 cpu = next_cpu(cpu, *cpumask); 1247 next_cpu = next_cpu(cpu, *cpumask);
1232 1248 if (likely(next_cpu < NR_CPUS))
1233 if (likely(cpu < NR_CPUS)) 1249 prefetch(&per_cpu(page_states, next_cpu));
1234 prefetch(&per_cpu(page_states, cpu));
1235 1250
1236 out = (unsigned long *)ret; 1251 out = (unsigned long *)ret;
1237 for (off = 0; off < nr; off++) 1252 for (off = 0; off < nr; off++)
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1764 continue; 1779 continue;
1765 page = pfn_to_page(pfn); 1780 page = pfn_to_page(pfn);
1766 set_page_links(page, zone, nid, pfn); 1781 set_page_links(page, zone, nid, pfn);
1767 set_page_count(page, 1); 1782 init_page_count(page);
1768 reset_page_mapcount(page); 1783 reset_page_mapcount(page);
1769 SetPageReserved(page); 1784 SetPageReserved(page);
1770 INIT_LIST_HEAD(&page->lru); 1785 INIT_LIST_HEAD(&page->lru);
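The split_page() helper introduced in the page_alloc.c hunks above is for callers that need a physically contiguous run of pages but want to return the pieces one page at a time; its own comment asks prospective users to check with lkml first. A hedged sketch of such a caller, with the array size and function names invented for illustration.

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages[4];

static int demo_carve_contiguous_run(void)
{
        /* non-compound order-2 block: 4 physically contiguous pages */
        struct page *block = alloc_pages(GFP_KERNEL, 2);
        int i;

        if (!block)
                return -ENOMEM;

        /* give every sub-page its own reference count */
        split_page(block, 2);

        for (i = 0; i < 4; i++)
                demo_pages[i] = block + i;
        return 0;
}

static void demo_release_one(int i)
{
        /* each sub-page is now an independent order-0 page */
        __free_page(demo_pages[i]);
        demo_pages[i] = NULL;
}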
diff --git a/mm/readahead.c b/mm/readahead.c
index 8d6eeaaa6296..301b36c4a0ce 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
53} 53}
54 54
55static inline void reset_ahead_window(struct file_ra_state *ra)
56{
57 /*
58 * ... but preserve ahead_start + ahead_size value,
59 * see 'recheck:' label in page_cache_readahead().
60 * Note: We never use ->ahead_size as rvalue without
61 * checking ->ahead_start != 0 first.
62 */
63 ra->ahead_size += ra->ahead_start;
64 ra->ahead_start = 0;
65}
66
55static inline void ra_off(struct file_ra_state *ra) 67static inline void ra_off(struct file_ra_state *ra)
56{ 68{
57 ra->start = 0; 69 ra->start = 0;
58 ra->flags = 0; 70 ra->flags = 0;
59 ra->size = 0; 71 ra->size = 0;
60 ra->ahead_start = 0; 72 reset_ahead_window(ra);
61 ra->ahead_size = 0;
62 return; 73 return;
63} 74}
64 75
@@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
72{ 83{
73 unsigned long newsize = roundup_pow_of_two(size); 84 unsigned long newsize = roundup_pow_of_two(size);
74 85
75 if (newsize <= max / 64) 86 if (newsize <= max / 32)
76 newsize = newsize * newsize; 87 newsize = newsize * 4;
77 else if (newsize <= max / 4) 88 else if (newsize <= max / 4)
78 newsize = max / 4; 89 newsize = newsize * 2;
79 else 90 else
80 newsize = max; 91 newsize = max;
81 return newsize; 92 return newsize;
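The new ramp in get_init_ra_size() grows small initial windows more gently (times four, then times two) instead of jumping straight to a quarter of the maximum. The standalone program below tabulates the difference for the default 128KB (32-page) readahead maximum; the two helpers mirror the before/after bodies of the hunk, while roundup_pow_of_two() and the driver are reimplemented here purely for the table and are not kernel code.

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

/* old body of get_init_ra_size() */
static unsigned long init_ra_old(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 64)
                newsize = newsize * newsize;
        else if (newsize <= max / 4)
                newsize = max / 4;
        else
                newsize = max;
        return newsize;
}

/* new body of get_init_ra_size() */
static unsigned long init_ra_new(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;
        return newsize;
}

int main(void)
{
        unsigned long max = 32; /* 128KB of 4KB pages */
        unsigned long size;

        printf("first-read pages  old window  new window\n");
        for (size = 1; size <= max; size <<= 1)
                printf("%16lu  %10lu  %10lu\n",
                       size, init_ra_old(size, max), init_ra_new(size, max));
        return 0;
}

For a one-page first read this yields a 4-page window instead of 8; the full maximum is still reached after a few sequential windows.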
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
426 * congestion. The ahead window will any way be closed 437 * congestion. The ahead window will any way be closed
427 * in case we failed due to excessive page cache hits. 438 * in case we failed due to excessive page cache hits.
428 */ 439 */
429 ra->ahead_start = 0; 440 reset_ahead_window(ra);
430 ra->ahead_size = 0;
431 } 441 }
432 442
433 return ret; 443 return ret;
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
520 * If we get here we are doing sequential IO and this was not the first 530 * If we get here we are doing sequential IO and this was not the first
521 * occurence (ie we have an existing window) 531 * occurence (ie we have an existing window)
522 */ 532 */
523
524 if (ra->ahead_start == 0) { /* no ahead window yet */ 533 if (ra->ahead_start == 0) { /* no ahead window yet */
525 if (!make_ahead_window(mapping, filp, ra, 0)) 534 if (!make_ahead_window(mapping, filp, ra, 0))
526 goto out; 535 goto recheck;
527 } 536 }
537
528 /* 538 /*
529 * Already have an ahead window, check if we crossed into it. 539 * Already have an ahead window, check if we crossed into it.
530 * If so, shift windows and issue a new ahead window. 540 * If so, shift windows and issue a new ahead window.
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
536 ra->start = ra->ahead_start; 546 ra->start = ra->ahead_start;
537 ra->size = ra->ahead_size; 547 ra->size = ra->ahead_size;
538 make_ahead_window(mapping, filp, ra, 0); 548 make_ahead_window(mapping, filp, ra, 0);
549recheck:
550 /* prev_page shouldn't overrun the ahead window */
551 ra->prev_page = min(ra->prev_page,
552 ra->ahead_start + ra->ahead_size - 1);
539 } 553 }
540 554
541out: 555out:
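The reset_ahead_window() comment above notes that the sum ahead_start + ahead_size is preserved precisely so the new 'recheck:' clamp still knows where the failed window would have ended. A small standalone illustration of that arithmetic with made-up numbers:

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long ahead_start = 100, ahead_size = 32;
        unsigned long prev_page = 140;  /* overshot the window */

        /* reset_ahead_window(): preserve start + size */
        ahead_size += ahead_start;
        ahead_start = 0;

        /* the 'recheck:' clamp from the hunk above */
        prev_page = min(prev_page, ahead_start + ahead_size - 1);

        printf("prev_page clamped to %lu (== 100 + 32 - 1)\n", prev_page);
        return 0;
}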
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b101f..1963e269314d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58 58
59//#define RMAP_DEBUG /* can be enabled only for debugging */ 59struct kmem_cache *anon_vma_cachep;
60
61kmem_cache_t *anon_vma_cachep;
62 60
63static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61static inline void validate_anon_vma(struct vm_area_struct *find_vma)
64{ 62{
65#ifdef RMAP_DEBUG 63#ifdef CONFIG_DEBUG_VM
66 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
67 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
68 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
166 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
167} 165}
168 166
169static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 167static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags)
170{ 169{
171 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
172 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
550void page_remove_rmap(struct page *page) 549void page_remove_rmap(struct page *page)
551{ 550{
552 if (atomic_add_negative(-1, &page->_mapcount)) { 551 if (atomic_add_negative(-1, &page->_mapcount)) {
553 if (page_mapcount(page) < 0) { 552#ifdef CONFIG_DEBUG_VM
553 if (unlikely(page_mapcount(page) < 0)) {
554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
555 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 555 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
556 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 556 printk (KERN_EMERG " page->count = %x\n", page_count(page));
557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
558 } 558 }
559 559#endif
560 BUG_ON(page_mapcount(page) < 0); 560 BUG_ON(page_mapcount(page) < 0);
561 /* 561 /*
562 * It would be tidy to reset the PageAnon mapping here, 562 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff7b..37eaf42ed2c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
875} 875}
876 876
877#ifdef CONFIG_NUMA 877#ifdef CONFIG_NUMA
878static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 878static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
879{ 879{
880 char *nodelist = strchr(value, ':'); 880 char *nodelist = strchr(value, ':');
881 int err = 1; 881 int err = 1;
@@ -2119,7 +2119,7 @@ failed:
2119 return err; 2119 return err;
2120} 2120}
2121 2121
2122static kmem_cache_t *shmem_inode_cachep; 2122static struct kmem_cache *shmem_inode_cachep;
2123 2123
2124static struct inode *shmem_alloc_inode(struct super_block *sb) 2124static struct inode *shmem_alloc_inode(struct super_block *sb)
2125{ 2125{
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2140} 2140}
2141 2141
2142static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) 2142static void init_once(void *foo, struct kmem_cache *cachep,
2143 unsigned long flags)
2143{ 2144{
2144 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2145 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2145 2146
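These rmap and shmem hunks are part of a tree-wide move from the kmem_cache_t typedef to plain 'struct kmem_cache', including in constructor prototypes. For reference, a cache set up in the new style looks roughly like the sketch below; the cache name, object type and constructor are invented for illustration and are not part of the patch.

#include <linux/init.h>
#include <linux/slab.h>

struct demo_object {
        int state;
};

static struct kmem_cache *demo_cachep;

static void demo_ctor(void *obj, struct kmem_cache *cachep,
                      unsigned long flags)
{
        if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
            SLAB_CTOR_CONSTRUCTOR) {
                struct demo_object *p = obj;

                p->state = 0;
        }
}

static int __init demo_cache_init(void)
{
        demo_cachep = kmem_cache_create("demo_object",
                                        sizeof(struct demo_object), 0,
                                        SLAB_HWCACHE_ALIGN,
                                        demo_ctor, NULL);
        if (!demo_cachep)
                return -ENOMEM;
        return 0;
}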
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab04..1c8f5ee230d5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
@@ -170,12 +170,12 @@
170#if DEBUG 170#if DEBUG
171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 173 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 176 SLAB_DESTROY_BY_RCU)
177#else 177#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 181 SLAB_DESTROY_BY_RCU)
@@ -266,16 +266,17 @@ struct array_cache {
266 unsigned int batchcount; 266 unsigned int batchcount;
267 unsigned int touched; 267 unsigned int touched;
268 spinlock_t lock; 268 spinlock_t lock;
269 void *entry[0]; /* 269 void *entry[0]; /*
270 * Must have this definition in here for the proper 270 * Must have this definition in here for the proper
271 * alignment of array_cache. Also simplifies accessing 271 * alignment of array_cache. Also simplifies accessing
272 * the entries. 272 * the entries.
273 * [0] is for gcc 2.95. It should really be []. 273 * [0] is for gcc 2.95. It should really be [].
274 */ 274 */
275}; 275};
276 276
277/* bootstrap: The caches do not work without cpuarrays anymore, 277/*
278 * but the cpuarrays are allocated from the generic caches... 278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
279 */ 280 */
280#define BOOT_CPUCACHE_ENTRIES 1 281#define BOOT_CPUCACHE_ENTRIES 1
281struct arraycache_init { 282struct arraycache_init {
@@ -291,13 +292,13 @@ struct kmem_list3 {
291 struct list_head slabs_full; 292 struct list_head slabs_full;
292 struct list_head slabs_free; 293 struct list_head slabs_free;
293 unsigned long free_objects; 294 unsigned long free_objects;
294 unsigned long next_reap;
295 int free_touched;
296 unsigned int free_limit; 295 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */ 296 unsigned int colour_next; /* Per-node cache coloring */
298 spinlock_t list_lock; 297 spinlock_t list_lock;
299 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
300 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
300 unsigned long next_reap; /* updated without locking */
301 int free_touched; /* updated without locking */
301}; 302};
302 303
303/* 304/*
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
310#define SIZE_L3 (1 + MAX_NUMNODES) 311#define SIZE_L3 (1 + MAX_NUMNODES)
311 312
312/* 313/*
313 * This function must be completely optimized away if 314 * This function must be completely optimized away if a constant is passed to
314 * a constant is passed to it. Mostly the same as 315 * it. Mostly the same as what is in linux/slab.h except it returns an index.
315 * what is in linux/slab.h except it returns an
316 * index.
317 */ 316 */
318static __always_inline int index_of(const size_t size) 317static __always_inline int index_of(const size_t size)
319{ 318{
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
351 parent->free_touched = 0; 350 parent->free_touched = 0;
352} 351}
353 352
354#define MAKE_LIST(cachep, listp, slab, nodeid) \ 353#define MAKE_LIST(cachep, listp, slab, nodeid) \
355 do { \ 354 do { \
356 INIT_LIST_HEAD(listp); \ 355 INIT_LIST_HEAD(listp); \
357 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 356 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
358 } while (0) 357 } while (0)
359 358
360#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 359#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
361 do { \ 360 do { \
362 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 361 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
363 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 362 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 363 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
@@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373struct kmem_cache { 372struct kmem_cache {
374/* 1) per-cpu data, touched during every alloc/free */ 373/* 1) per-cpu data, touched during every alloc/free */
375 struct array_cache *array[NR_CPUS]; 374 struct array_cache *array[NR_CPUS];
375/* 2) Cache tunables. Protected by cache_chain_mutex */
376 unsigned int batchcount; 376 unsigned int batchcount;
377 unsigned int limit; 377 unsigned int limit;
378 unsigned int shared; 378 unsigned int shared;
379
379 unsigned int buffer_size; 380 unsigned int buffer_size;
380/* 2) touched by every alloc & free from the backend */ 381/* 3) touched by every alloc & free from the backend */
381 struct kmem_list3 *nodelists[MAX_NUMNODES]; 382 struct kmem_list3 *nodelists[MAX_NUMNODES];
382 unsigned int flags; /* constant flags */
383 unsigned int num; /* # of objs per slab */
384 spinlock_t spinlock;
385 383
386/* 3) cache_grow/shrink */ 384 unsigned int flags; /* constant flags */
385 unsigned int num; /* # of objs per slab */
386
387/* 4) cache_grow/shrink */
387 /* order of pgs per slab (2^n) */ 388 /* order of pgs per slab (2^n) */
388 unsigned int gfporder; 389 unsigned int gfporder;
389 390
390 /* force GFP flags, e.g. GFP_DMA */ 391 /* force GFP flags, e.g. GFP_DMA */
391 gfp_t gfpflags; 392 gfp_t gfpflags;
392 393
393 size_t colour; /* cache colouring range */ 394 size_t colour; /* cache colouring range */
394 unsigned int colour_off; /* colour offset */ 395 unsigned int colour_off; /* colour offset */
395 struct kmem_cache *slabp_cache; 396 struct kmem_cache *slabp_cache;
396 unsigned int slab_size; 397 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 398 unsigned int dflags; /* dynamic flags */
398 399
399 /* constructor func */ 400 /* constructor func */
400 void (*ctor) (void *, struct kmem_cache *, unsigned long); 401 void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +403,11 @@ struct kmem_cache {
402 /* de-constructor func */ 403 /* de-constructor func */
403 void (*dtor) (void *, struct kmem_cache *, unsigned long); 404 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 405
405/* 4) cache creation/removal */ 406/* 5) cache creation/removal */
406 const char *name; 407 const char *name;
407 struct list_head next; 408 struct list_head next;
408 409
409/* 5) statistics */ 410/* 6) statistics */
410#if STATS 411#if STATS
411 unsigned long num_active; 412 unsigned long num_active;
412 unsigned long num_allocations; 413 unsigned long num_allocations;
@@ -438,8 +439,9 @@ struct kmem_cache {
438#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 439#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
439 440
440#define BATCHREFILL_LIMIT 16 441#define BATCHREFILL_LIMIT 16
441/* Optimization question: fewer reaps means less 442/*
442 * probability for unnessary cpucache drain/refill cycles. 443 * Optimization question: fewer reaps means less probability for unnessary
444 * cpucache drain/refill cycles.
443 * 445 *
444 * OTOH the cpuarrays can contain lots of objects, 446 * OTOH the cpuarrays can contain lots of objects,
445 * which could lock up otherwise freeable slabs. 447 * which could lock up otherwise freeable slabs.
@@ -453,17 +455,19 @@ struct kmem_cache {
453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 455#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
454#define STATS_INC_GROWN(x) ((x)->grown++) 456#define STATS_INC_GROWN(x) ((x)->grown++)
455#define STATS_INC_REAPED(x) ((x)->reaped++) 457#define STATS_INC_REAPED(x) ((x)->reaped++)
456#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 458#define STATS_SET_HIGH(x) \
457 (x)->high_mark = (x)->num_active; \ 459 do { \
458 } while (0) 460 if ((x)->num_active > (x)->high_mark) \
461 (x)->high_mark = (x)->num_active; \
462 } while (0)
459#define STATS_INC_ERR(x) ((x)->errors++) 463#define STATS_INC_ERR(x) ((x)->errors++)
460#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 464#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
461#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 465#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
462#define STATS_SET_FREEABLE(x, i) \ 466#define STATS_SET_FREEABLE(x, i) \
463 do { if ((x)->max_freeable < i) \ 467 do { \
464 (x)->max_freeable = i; \ 468 if ((x)->max_freeable < i) \
465 } while (0) 469 (x)->max_freeable = i; \
466 470 } while (0)
467#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 471#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
468#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 472#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
469#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 473#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
@@ -478,9 +482,7 @@ struct kmem_cache {
478#define STATS_INC_ERR(x) do { } while (0) 482#define STATS_INC_ERR(x) do { } while (0)
479#define STATS_INC_NODEALLOCS(x) do { } while (0) 483#define STATS_INC_NODEALLOCS(x) do { } while (0)
480#define STATS_INC_NODEFREES(x) do { } while (0) 484#define STATS_INC_NODEFREES(x) do { } while (0)
481#define STATS_SET_FREEABLE(x, i) \ 485#define STATS_SET_FREEABLE(x, i) do { } while (0)
482 do { } while (0)
483
484#define STATS_INC_ALLOCHIT(x) do { } while (0) 486#define STATS_INC_ALLOCHIT(x) do { } while (0)
485#define STATS_INC_ALLOCMISS(x) do { } while (0) 487#define STATS_INC_ALLOCMISS(x) do { } while (0)
486#define STATS_INC_FREEHIT(x) do { } while (0) 488#define STATS_INC_FREEHIT(x) do { } while (0)
@@ -488,7 +490,8 @@ struct kmem_cache {
488#endif 490#endif
489 491
490#if DEBUG 492#if DEBUG
491/* Magic nums for obj red zoning. 493/*
494 * Magic nums for obj red zoning.
492 * Placed in the first word before and the first word after an obj. 495 * Placed in the first word before and the first word after an obj.
493 */ 496 */
494#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 497#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
@@ -499,7 +502,8 @@ struct kmem_cache {
499#define POISON_FREE 0x6b /* for use-after-free poisoning */ 502#define POISON_FREE 0x6b /* for use-after-free poisoning */
500#define POISON_END 0xa5 /* end-byte of poisoning */ 503#define POISON_END 0xa5 /* end-byte of poisoning */
501 504
502/* memory layout of objects: 505/*
506 * memory layout of objects:
503 * 0 : objp 507 * 0 : objp
504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
505 * the end of an object is aligned with the end of the real 509 * the end of an object is aligned with the end of the real
@@ -508,7 +512,8 @@ struct kmem_cache {
508 * redzone word. 512 * redzone word.
509 * cachep->obj_offset: The real object. 513 * cachep->obj_offset: The real object.
510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 * [BYTES_PER_WORD long]
512 */ 517 */
513static int obj_offset(struct kmem_cache *cachep) 518static int obj_offset(struct kmem_cache *cachep)
514{ 519{
@@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
552#endif 557#endif
553 558
554/* 559/*
555 * Maximum size of an obj (in 2^order pages) 560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
556 * and absolute limit for the gfp order. 561 * order.
557 */ 562 */
558#if defined(CONFIG_LARGE_ALLOCS) 563#if defined(CONFIG_LARGE_ALLOCS)
559#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 564#define MAX_OBJ_ORDER 13 /* up to 32Mb */
@@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
573#define BREAK_GFP_ORDER_LO 0 578#define BREAK_GFP_ORDER_LO 0
574static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 579static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
575 580
576/* Functions for storing/retrieving the cachep and or slab from the 581/*
577 * global 'mem_map'. These are used to find the slab an obj belongs to. 582 * Functions for storing/retrieving the cachep and or slab from the page
578 * With kfree(), these are used to find the cache which an obj belongs to. 583 * allocator. These are used to find the slab an obj belongs to. With kfree(),
584 * these are used to find the cache which an obj belongs to.
579 */ 585 */
580static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 586static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
581{ 587{
@@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
584 590
585static inline struct kmem_cache *page_get_cache(struct page *page) 591static inline struct kmem_cache *page_get_cache(struct page *page)
586{ 592{
593 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page);
587 return (struct kmem_cache *)page->lru.next; 595 return (struct kmem_cache *)page->lru.next;
588} 596}
589 597
@@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
594 602
595static inline struct slab *page_get_slab(struct page *page) 603static inline struct slab *page_get_slab(struct page *page)
596{ 604{
605 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page);
597 return (struct slab *)page->lru.prev; 607 return (struct slab *)page->lru.prev;
598} 608}
599 609
@@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj)
609 return page_get_slab(page); 619 return page_get_slab(page);
610} 620}
611 621
612/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 622static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
623 unsigned int idx)
624{
625 return slab->s_mem + cache->buffer_size * idx;
626}
627
628static inline unsigned int obj_to_index(struct kmem_cache *cache,
629 struct slab *slab, void *obj)
630{
631 return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
632}
633
634/*
635 * These are the default caches for kmalloc. Custom caches can have other sizes.
636 */
613struct cache_sizes malloc_sizes[] = { 637struct cache_sizes malloc_sizes[] = {
614#define CACHE(x) { .cs_size = (x) }, 638#define CACHE(x) { .cs_size = (x) },
615#include <linux/kmalloc_sizes.h> 639#include <linux/kmalloc_sizes.h>
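index_to_obj() and obj_to_index() added above are the two directions of the same layout rule: objects sit back to back from slab->s_mem with a stride of buffer_size. A throwaway userspace check of that round trip, illustrative only and not kernel code:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        char slab_mem[8 * 64];          /* 8 objects, 64 bytes each */
        char *s_mem = slab_mem;
        unsigned int buffer_size = 64;
        unsigned int idx;

        for (idx = 0; idx < 8; idx++) {
                void *obj = s_mem + buffer_size * idx;      /* index_to_obj */
                unsigned int back =
                        (unsigned)((char *)obj - s_mem) / buffer_size;  /* obj_to_index */

                assert(back == idx);
        }
        printf("index<->object mapping round-trips for all 8 objects\n");
        return 0;
}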
@@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = {
642 .limit = BOOT_CPUCACHE_ENTRIES, 666 .limit = BOOT_CPUCACHE_ENTRIES,
643 .shared = 1, 667 .shared = 1,
644 .buffer_size = sizeof(struct kmem_cache), 668 .buffer_size = sizeof(struct kmem_cache),
645 .flags = SLAB_NO_REAP,
646 .spinlock = SPIN_LOCK_UNLOCKED,
647 .name = "kmem_cache", 669 .name = "kmem_cache",
648#if DEBUG 670#if DEBUG
649 .obj_size = sizeof(struct kmem_cache), 671 .obj_size = sizeof(struct kmem_cache),
@@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
655static struct list_head cache_chain; 677static struct list_head cache_chain;
656 678
657/* 679/*
658 * vm_enough_memory() looks at this to determine how many 680 * vm_enough_memory() looks at this to determine how many slab-allocated pages
659 * slab-allocated pages are possibly freeable under pressure 681 * are possibly freeable under pressure
660 * 682 *
661 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 683 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
662 */ 684 */
@@ -675,7 +697,8 @@ static enum {
675 697
676static DEFINE_PER_CPU(struct work_struct, reap_work); 698static DEFINE_PER_CPU(struct work_struct, reap_work);
677 699
678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); 700static void free_block(struct kmem_cache *cachep, void **objpp, int len,
701 int node);
679static void enable_cpucache(struct kmem_cache *cachep); 702static void enable_cpucache(struct kmem_cache *cachep);
680static void cache_reap(void *unused); 703static void cache_reap(void *unused);
681static int __node_shrink(struct kmem_cache *cachep, int node); 704static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
685 return cachep->array[smp_processor_id()]; 708 return cachep->array[smp_processor_id()];
686} 709}
687 710
688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) 711static inline struct kmem_cache *__find_general_cachep(size_t size,
712 gfp_t gfpflags)
689{ 713{
690 struct cache_sizes *csizep = malloc_sizes; 714 struct cache_sizes *csizep = malloc_sizes;
691 715
@@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 744 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721} 745}
722 746
723/* Calculate the number of objects and left-over bytes for a given 747/*
724 buffer size. */ 748 * Calculate the number of objects and left-over bytes for a given buffer size.
749 */
725static void cache_estimate(unsigned long gfporder, size_t buffer_size, 750static void cache_estimate(unsigned long gfporder, size_t buffer_size,
726 size_t align, int flags, size_t *left_over, 751 size_t align, int flags, size_t *left_over,
727 unsigned int *num) 752 unsigned int *num)
@@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
782 807
783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 808#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
784 809
785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) 810static void __slab_error(const char *function, struct kmem_cache *cachep,
811 char *msg)
786{ 812{
787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 813 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
788 function, cachep->name, msg); 814 function, cachep->name, msg);
@@ -804,7 +830,7 @@ static void init_reap_node(int cpu)
804 830
805 node = next_node(cpu_to_node(cpu), node_online_map); 831 node = next_node(cpu_to_node(cpu), node_online_map);
806 if (node == MAX_NUMNODES) 832 if (node == MAX_NUMNODES)
807 node = 0; 833 node = first_node(node_online_map);
808 834
809 __get_cpu_var(reap_node) = node; 835 __get_cpu_var(reap_node) = node;
810} 836}
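init_reap_node() now wraps to first_node(node_online_map) rather than hard-coding node 0, which may not be online. The same round-robin step in isolation; the helper name is invented for illustration.

#include <linux/nodemask.h>

static int demo_next_online_node(int node)
{
        node = next_node(node, node_online_map);
        if (node == MAX_NUMNODES)
                node = first_node(node_online_map);     /* not a literal 0 */
        return node;
}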
@@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
906 932
907 if (!ac_ptr) 933 if (!ac_ptr)
908 return; 934 return;
909
910 for_each_node(i) 935 for_each_node(i)
911 kfree(ac_ptr[i]); 936 kfree(ac_ptr[i]);
912
913 kfree(ac_ptr); 937 kfree(ac_ptr);
914} 938}
915 939
@@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
943 } 967 }
944} 968}
945 969
946static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) 970static void drain_alien_cache(struct kmem_cache *cachep,
971 struct array_cache **alien)
947{ 972{
948 int i = 0; 973 int i = 0;
949 struct array_cache *ac; 974 struct array_cache *ac;
@@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
986 switch (action) { 1011 switch (action) {
987 case CPU_UP_PREPARE: 1012 case CPU_UP_PREPARE:
988 mutex_lock(&cache_chain_mutex); 1013 mutex_lock(&cache_chain_mutex);
989 /* we need to do this right in the beginning since 1014 /*
1015 * We need to do this right in the beginning since
990 * alloc_arraycache's are going to use this list. 1016 * alloc_arraycache's are going to use this list.
991 * kmalloc_node allows us to add the slab to the right 1017 * kmalloc_node allows us to add the slab to the right
992 * kmem_list3 and not this cpu's kmem_list3 1018 * kmem_list3 and not this cpu's kmem_list3
993 */ 1019 */
994 1020
995 list_for_each_entry(cachep, &cache_chain, next) { 1021 list_for_each_entry(cachep, &cache_chain, next) {
996 /* setup the size64 kmemlist for cpu before we can 1022 /*
1023 * Set up the size64 kmemlist for cpu before we can
997 * begin anything. Make sure some other cpu on this 1024 * begin anything. Make sure some other cpu on this
998 * node has not already allocated this 1025 * node has not already allocated this
999 */ 1026 */
1000 if (!cachep->nodelists[node]) { 1027 if (!cachep->nodelists[node]) {
1001 if (!(l3 = kmalloc_node(memsize, 1028 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1002 GFP_KERNEL, node))) 1029 if (!l3)
1003 goto bad; 1030 goto bad;
1004 kmem_list3_init(l3); 1031 kmem_list3_init(l3);
1005 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1032 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1015 1042
1016 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1043 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1017 cachep->nodelists[node]->free_limit = 1044 cachep->nodelists[node]->free_limit =
1018 (1 + nr_cpus_node(node)) * 1045 (1 + nr_cpus_node(node)) *
1019 cachep->batchcount + cachep->num; 1046 cachep->batchcount + cachep->num;
1020 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1047 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1021 } 1048 }
1022 1049
1023 /* Now we can go ahead with allocating the shared array's 1050 /*
1024 & array cache's */ 1051 * Now we can go ahead with allocating the shared arrays and
1052 * array caches
1053 */
1025 list_for_each_entry(cachep, &cache_chain, next) { 1054 list_for_each_entry(cachep, &cache_chain, next) {
1026 struct array_cache *nc; 1055 struct array_cache *nc;
1027 struct array_cache *shared; 1056 struct array_cache *shared;
@@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1041 if (!alien) 1070 if (!alien)
1042 goto bad; 1071 goto bad;
1043 cachep->array[cpu] = nc; 1072 cachep->array[cpu] = nc;
1044
1045 l3 = cachep->nodelists[node]; 1073 l3 = cachep->nodelists[node];
1046 BUG_ON(!l3); 1074 BUG_ON(!l3);
1047 1075
@@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1061 } 1089 }
1062#endif 1090#endif
1063 spin_unlock_irq(&l3->list_lock); 1091 spin_unlock_irq(&l3->list_lock);
1064
1065 kfree(shared); 1092 kfree(shared);
1066 free_alien_cache(alien); 1093 free_alien_cache(alien);
1067 } 1094 }
@@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1083 /* fall thru */ 1110 /* fall thru */
1084 case CPU_UP_CANCELED: 1111 case CPU_UP_CANCELED:
1085 mutex_lock(&cache_chain_mutex); 1112 mutex_lock(&cache_chain_mutex);
1086
1087 list_for_each_entry(cachep, &cache_chain, next) { 1113 list_for_each_entry(cachep, &cache_chain, next) {
1088 struct array_cache *nc; 1114 struct array_cache *nc;
1089 struct array_cache *shared; 1115 struct array_cache *shared;
@@ -1150,7 +1176,7 @@ free_array_cache:
1150#endif 1176#endif
1151 } 1177 }
1152 return NOTIFY_OK; 1178 return NOTIFY_OK;
1153 bad: 1179bad:
1154 mutex_unlock(&cache_chain_mutex); 1180 mutex_unlock(&cache_chain_mutex);
1155 return NOTIFY_BAD; 1181 return NOTIFY_BAD;
1156} 1182}
@@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1160/* 1186/*
1161 * swap the static kmem_list3 with kmalloced memory 1187 * swap the static kmem_list3 with kmalloced memory
1162 */ 1188 */
1163static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) 1189static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1190 int nodeid)
1164{ 1191{
1165 struct kmem_list3 *ptr; 1192 struct kmem_list3 *ptr;
1166 1193
@@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
1175 local_irq_enable(); 1202 local_irq_enable();
1176} 1203}
1177 1204
1178/* Initialisation. 1205/*
1179 * Called after the gfp() functions have been enabled, and before smp_init(). 1206 * Initialisation. Called after the page allocator have been initialised and
1207 * before smp_init().
1180 */ 1208 */
1181void __init kmem_cache_init(void) 1209void __init kmem_cache_init(void)
1182{ 1210{
@@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void)
1201 1229
1202 /* Bootstrap is tricky, because several objects are allocated 1230 /* Bootstrap is tricky, because several objects are allocated
1203 * from caches that do not exist yet: 1231 * from caches that do not exist yet:
1204 * 1) initialize the cache_cache cache: it contains the struct kmem_cache 1232 * 1) initialize the cache_cache cache: it contains the struct
1205 * structures of all caches, except cache_cache itself: cache_cache 1233 * kmem_cache structures of all caches, except cache_cache itself:
1206 * is statically allocated. 1234 * cache_cache is statically allocated.
1207 * Initially an __init data area is used for the head array and the 1235 * Initially an __init data area is used for the head array and the
1208 * kmem_list3 structures, it's replaced with a kmalloc allocated 1236 * kmem_list3 structures, it's replaced with a kmalloc allocated
1209 * array at the end of the bootstrap. 1237 * array at the end of the bootstrap.
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void)
1226 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1254 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1227 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1255 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1228 1256
1229 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); 1257 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1258 cache_line_size());
1230 1259
1231 for (order = 0; order < MAX_ORDER; order++) { 1260 for (order = 0; order < MAX_ORDER; order++) {
1232 cache_estimate(order, cache_cache.buffer_size, 1261 cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void)
1245 sizes = malloc_sizes; 1274 sizes = malloc_sizes;
1246 names = cache_names; 1275 names = cache_names;
1247 1276
1248 /* Initialize the caches that provide memory for the array cache 1277 /*
1249 * and the kmem_list3 structures first. 1278 * Initialize the caches that provide memory for the array cache and the
1250 * Without this, further allocations will bug 1279 * kmem_list3 structures first. Without this, further allocations will
1280 * bug.
1251 */ 1281 */
1252 1282
1253 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1283 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1254 sizes[INDEX_AC].cs_size, 1284 sizes[INDEX_AC].cs_size,
1255 ARCH_KMALLOC_MINALIGN, 1285 ARCH_KMALLOC_MINALIGN,
1256 (ARCH_KMALLOC_FLAGS | 1286 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1257 SLAB_PANIC), NULL, NULL); 1287 NULL, NULL);
1258 1288
1259 if (INDEX_AC != INDEX_L3) 1289 if (INDEX_AC != INDEX_L3) {
1260 sizes[INDEX_L3].cs_cachep = 1290 sizes[INDEX_L3].cs_cachep =
1261 kmem_cache_create(names[INDEX_L3].name, 1291 kmem_cache_create(names[INDEX_L3].name,
1262 sizes[INDEX_L3].cs_size, 1292 sizes[INDEX_L3].cs_size,
1263 ARCH_KMALLOC_MINALIGN, 1293 ARCH_KMALLOC_MINALIGN,
1264 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, 1294 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1265 NULL); 1295 NULL, NULL);
1296 }
1266 1297
1267 while (sizes->cs_size != ULONG_MAX) { 1298 while (sizes->cs_size != ULONG_MAX) {
1268 /* 1299 /*
@@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void)
1272 * Note for systems short on memory removing the alignment will 1303 * Note for systems short on memory removing the alignment will
1273 * allow tighter packing of the smaller caches. 1304 * allow tighter packing of the smaller caches.
1274 */ 1305 */
1275 if (!sizes->cs_cachep) 1306 if (!sizes->cs_cachep) {
1276 sizes->cs_cachep = kmem_cache_create(names->name, 1307 sizes->cs_cachep = kmem_cache_create(names->name,
1277 sizes->cs_size, 1308 sizes->cs_size,
1278 ARCH_KMALLOC_MINALIGN, 1309 ARCH_KMALLOC_MINALIGN,
1279 (ARCH_KMALLOC_FLAGS 1310 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1280 | SLAB_PANIC), 1311 NULL, NULL);
1281 NULL, NULL); 1312 }
1282 1313
1283 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1314 /* Inc off-slab bufctl limit until the ceiling is hit. */
1284 if (!(OFF_SLAB(sizes->cs_cachep))) { 1315 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void)
1287 } 1318 }
1288 1319
1289 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1320 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1290 sizes->cs_size, 1321 sizes->cs_size,
1291 ARCH_KMALLOC_MINALIGN, 1322 ARCH_KMALLOC_MINALIGN,
1292 (ARCH_KMALLOC_FLAGS | 1323 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1293 SLAB_CACHE_DMA | 1324 SLAB_PANIC,
1294 SLAB_PANIC), NULL, 1325 NULL, NULL);
1295 NULL);
1296
1297 sizes++; 1326 sizes++;
1298 names++; 1327 names++;
1299 } 1328 }
@@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void)
1345 struct kmem_cache *cachep; 1374 struct kmem_cache *cachep;
1346 mutex_lock(&cache_chain_mutex); 1375 mutex_lock(&cache_chain_mutex);
1347 list_for_each_entry(cachep, &cache_chain, next) 1376 list_for_each_entry(cachep, &cache_chain, next)
1348 enable_cpucache(cachep); 1377 enable_cpucache(cachep);
1349 mutex_unlock(&cache_chain_mutex); 1378 mutex_unlock(&cache_chain_mutex);
1350 } 1379 }
1351 1380
1352 /* Done! */ 1381 /* Done! */
1353 g_cpucache_up = FULL; 1382 g_cpucache_up = FULL;
1354 1383
1355 /* Register a cpu startup notifier callback 1384 /*
1356 * that initializes cpu_cache_get for all new cpus 1385 * Register a cpu startup notifier callback that initializes
1386 * cpu_cache_get for all new cpus
1357 */ 1387 */
1358 register_cpu_notifier(&cpucache_notifier); 1388 register_cpu_notifier(&cpucache_notifier);
1359 1389
1360 /* The reap timers are started later, with a module init call: 1390 /*
1361 * That part of the kernel is not yet operational. 1391 * The reap timers are started later, with a module init call: That part
1392 * of the kernel is not yet operational.
1362 */ 1393 */
1363} 1394}
1364 1395
@@ -1366,16 +1397,13 @@ static int __init cpucache_init(void)
1366{ 1397{
1367 int cpu; 1398 int cpu;
1368 1399
1369 /* 1400 /*
1370 * Register the timers that return unneeded 1401 * Register the timers that return unneeded pages to the page allocator
1371 * pages to gfp.
1372 */ 1402 */
1373 for_each_online_cpu(cpu) 1403 for_each_online_cpu(cpu)
1374 start_cpu_timer(cpu); 1404 start_cpu_timer(cpu);
1375
1376 return 0; 1405 return 0;
1377} 1406}
1378
1379__initcall(cpucache_init); 1407__initcall(cpucache_init);
1380 1408
1381/* 1409/*
@@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1402 atomic_add(i, &slab_reclaim_pages); 1430 atomic_add(i, &slab_reclaim_pages);
1403 add_page_state(nr_slab, i); 1431 add_page_state(nr_slab, i);
1404 while (i--) { 1432 while (i--) {
1405 SetPageSlab(page); 1433 __SetPageSlab(page);
1406 page++; 1434 page++;
1407 } 1435 }
1408 return addr; 1436 return addr;
@@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1418 const unsigned long nr_freed = i; 1446 const unsigned long nr_freed = i;
1419 1447
1420 while (i--) { 1448 while (i--) {
1421 if (!TestClearPageSlab(page)) 1449 BUG_ON(!PageSlab(page));
1422 BUG(); 1450 __ClearPageSlab(page);
1423 page++; 1451 page++;
1424 } 1452 }
1425 sub_page_state(nr_slab, nr_freed); 1453 sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit)
1489{ 1517{
1490 int i; 1518 int i;
1491 printk(KERN_ERR "%03x:", offset); 1519 printk(KERN_ERR "%03x:", offset);
1492 for (i = 0; i < limit; i++) { 1520 for (i = 0; i < limit; i++)
1493 printk(" %02x", (unsigned char)data[offset + i]); 1521 printk(" %02x", (unsigned char)data[offset + i]);
1494 }
1495 printk("\n"); 1522 printk("\n");
1496} 1523}
1497#endif 1524#endif
@@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1505 1532
1506 if (cachep->flags & SLAB_RED_ZONE) { 1533 if (cachep->flags & SLAB_RED_ZONE) {
1507 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1534 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1508 *dbg_redzone1(cachep, objp), 1535 *dbg_redzone1(cachep, objp),
1509 *dbg_redzone2(cachep, objp)); 1536 *dbg_redzone2(cachep, objp));
1510 } 1537 }
1511 1538
1512 if (cachep->flags & SLAB_STORE_USER) { 1539 if (cachep->flags & SLAB_STORE_USER) {
1513 printk(KERN_ERR "Last user: [<%p>]", 1540 printk(KERN_ERR "Last user: [<%p>]",
1514 *dbg_userword(cachep, objp)); 1541 *dbg_userword(cachep, objp));
1515 print_symbol("(%s)", 1542 print_symbol("(%s)",
1516 (unsigned long)*dbg_userword(cachep, objp)); 1543 (unsigned long)*dbg_userword(cachep, objp));
1517 printk("\n"); 1544 printk("\n");
1518 } 1545 }
1519 realobj = (char *)objp + obj_offset(cachep); 1546 realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1546 /* Print header */ 1573 /* Print header */
1547 if (lines == 0) { 1574 if (lines == 0) {
1548 printk(KERN_ERR 1575 printk(KERN_ERR
1549 "Slab corruption: start=%p, len=%d\n", 1576 "Slab corruption: start=%p, len=%d\n",
1550 realobj, size); 1577 realobj, size);
1551 print_objinfo(cachep, objp, 0); 1578 print_objinfo(cachep, objp, 0);
1552 } 1579 }
1553 /* Hexdump the affected line */ 1580 /* Hexdump the affected line */
@@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1568 * exist: 1595 * exist:
1569 */ 1596 */
1570 struct slab *slabp = virt_to_slab(objp); 1597 struct slab *slabp = virt_to_slab(objp);
1571 int objnr; 1598 unsigned int objnr;
1572 1599
1573 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 1600 objnr = obj_to_index(cachep, slabp, objp);
1574 if (objnr) { 1601 if (objnr) {
1575 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; 1602 objp = index_to_obj(cachep, slabp, objnr - 1);
1576 realobj = (char *)objp + obj_offset(cachep); 1603 realobj = (char *)objp + obj_offset(cachep);
1577 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1604 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1578 realobj, size); 1605 realobj, size);
1579 print_objinfo(cachep, objp, 2); 1606 print_objinfo(cachep, objp, 2);
1580 } 1607 }
1581 if (objnr + 1 < cachep->num) { 1608 if (objnr + 1 < cachep->num) {
1582 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; 1609 objp = index_to_obj(cachep, slabp, objnr + 1);
1583 realobj = (char *)objp + obj_offset(cachep); 1610 realobj = (char *)objp + obj_offset(cachep);
1584 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1611 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1585 realobj, size); 1612 realobj, size);
@@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1591 1618
1592#if DEBUG 1619#if DEBUG
1593/** 1620/**
1594 * slab_destroy_objs - call the registered destructor for each object in 1621 * slab_destroy_objs - destroy a slab and its objects
1595 * a slab that is to be destroyed. 1622 * @cachep: cache pointer being destroyed
1623 * @slabp: slab pointer being destroyed
1624 *
1625 * Call the registered destructor for each object in a slab that is being
1626 * destroyed.
1596 */ 1627 */
1597static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1628static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1598{ 1629{
1599 int i; 1630 int i;
1600 for (i = 0; i < cachep->num; i++) { 1631 for (i = 0; i < cachep->num; i++) {
1601 void *objp = slabp->s_mem + cachep->buffer_size * i; 1632 void *objp = index_to_obj(cachep, slabp, i);
1602 1633
1603 if (cachep->flags & SLAB_POISON) { 1634 if (cachep->flags & SLAB_POISON) {
1604#ifdef CONFIG_DEBUG_PAGEALLOC 1635#ifdef CONFIG_DEBUG_PAGEALLOC
1605 if ((cachep->buffer_size % PAGE_SIZE) == 0 1636 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1606 && OFF_SLAB(cachep)) 1637 OFF_SLAB(cachep))
1607 kernel_map_pages(virt_to_page(objp), 1638 kernel_map_pages(virt_to_page(objp),
1608 cachep->buffer_size / PAGE_SIZE, 1639 cachep->buffer_size / PAGE_SIZE, 1);
1609 1);
1610 else 1640 else
1611 check_poison_obj(cachep, objp); 1641 check_poison_obj(cachep, objp);
1612#else 1642#else
@@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1631 if (cachep->dtor) { 1661 if (cachep->dtor) {
1632 int i; 1662 int i;
1633 for (i = 0; i < cachep->num; i++) { 1663 for (i = 0; i < cachep->num; i++) {
1634 void *objp = slabp->s_mem + cachep->buffer_size * i; 1664 void *objp = index_to_obj(cachep, slabp, i);
1635 (cachep->dtor) (objp, cachep, 0); 1665 (cachep->dtor) (objp, cachep, 0);
1636 } 1666 }
1637 } 1667 }
@@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1639#endif 1669#endif
1640 1670
1641/** 1671/**
1672 * slab_destroy - destroy and release all objects in a slab
1673 * @cachep: cache pointer being destroyed
1674 * @slabp: slab pointer being destroyed
1675 *
1642 * Destroy all the objs in a slab, and release the mem back to the system. 1676 * Destroy all the objs in a slab, and release the mem back to the system.
1643 * Before calling the slab must have been unlinked from the cache. 1677 * Before calling the slab must have been unlinked from the cache. The
1644 * The cache-lock is not held/needed. 1678 * cache-lock is not held/needed.
1645 */ 1679 */
1646static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1680static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1647{ 1681{
@@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1662 } 1696 }
1663} 1697}
1664 1698
1665/* For setting up all the kmem_list3s for cache whose buffer_size is same 1699/*
1666 as size of kmem_list3. */ 1700 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1701 * size of kmem_list3.
1702 */
1667static void set_up_list3s(struct kmem_cache *cachep, int index) 1703static void set_up_list3s(struct kmem_cache *cachep, int index)
1668{ 1704{
1669 int node; 1705 int node;
@@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1689 * high order pages for slabs. When the gfp() functions are more friendly 1725 * high order pages for slabs. When the gfp() functions are more friendly
1690 * towards high-order requests, this should be changed. 1726 * towards high-order requests, this should be changed.
1691 */ 1727 */
1692static inline size_t calculate_slab_order(struct kmem_cache *cachep, 1728static size_t calculate_slab_order(struct kmem_cache *cachep,
1693 size_t size, size_t align, unsigned long flags) 1729 size_t size, size_t align, unsigned long flags)
1694{ 1730{
1695 size_t left_over = 0; 1731 size_t left_over = 0;
1696 int gfporder; 1732 int gfporder;
1697 1733
1698 for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { 1734 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1699 unsigned int num; 1735 unsigned int num;
1700 size_t remainder; 1736 size_t remainder;
1701 1737
@@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1730 /* 1766 /*
1731 * Acceptable internal fragmentation? 1767 * Acceptable internal fragmentation?
1732 */ 1768 */
1733 if ((left_over * 8) <= (PAGE_SIZE << gfporder)) 1769 if (left_over * 8 <= (PAGE_SIZE << gfporder))
1734 break; 1770 break;
1735 } 1771 }
1736 return left_over; 1772 return left_over;
1737} 1773}
1738 1774
1775static void setup_cpu_cache(struct kmem_cache *cachep)
1776{
1777 if (g_cpucache_up == FULL) {
1778 enable_cpucache(cachep);
1779 return;
1780 }
1781 if (g_cpucache_up == NONE) {
1782 /*
1783 * Note: the first kmem_cache_create must create the cache
1784 * that's used by kmalloc(24), otherwise the creation of
1785 * further caches will BUG().
1786 */
1787 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1788
1789 /*
1790 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1791 * the first cache, then we need to set up all its list3s,
1792 * otherwise the creation of further caches will BUG().
1793 */
1794 set_up_list3s(cachep, SIZE_AC);
1795 if (INDEX_AC == INDEX_L3)
1796 g_cpucache_up = PARTIAL_L3;
1797 else
1798 g_cpucache_up = PARTIAL_AC;
1799 } else {
1800 cachep->array[smp_processor_id()] =
1801 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1802
1803 if (g_cpucache_up == PARTIAL_AC) {
1804 set_up_list3s(cachep, SIZE_L3);
1805 g_cpucache_up = PARTIAL_L3;
1806 } else {
1807 int node;
1808 for_each_online_node(node) {
1809 cachep->nodelists[node] =
1810 kmalloc_node(sizeof(struct kmem_list3),
1811 GFP_KERNEL, node);
1812 BUG_ON(!cachep->nodelists[node]);
1813 kmem_list3_init(cachep->nodelists[node]);
1814 }
1815 }
1816 }
1817 cachep->nodelists[numa_node_id()]->next_reap =
1818 jiffies + REAPTIMEOUT_LIST3 +
1819 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1820
1821 cpu_cache_get(cachep)->avail = 0;
1822 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1823 cpu_cache_get(cachep)->batchcount = 1;
1824 cpu_cache_get(cachep)->touched = 0;
1825 cachep->batchcount = 1;
1826 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1827}
1828
1739/** 1829/**
1740 * kmem_cache_create - Create a cache. 1830 * kmem_cache_create - Create a cache.
1741 * @name: A string which is used in /proc/slabinfo to identify this cache. 1831 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1751 * and the @dtor is run before the pages are handed back. 1841 * and the @dtor is run before the pages are handed back.
1752 * 1842 *
1753 * @name must be valid until the cache is destroyed. This implies that 1843 * @name must be valid until the cache is destroyed. This implies that
1754 * the module calling this has to destroy the cache before getting 1844 * the module calling this has to destroy the cache before getting unloaded.
1755 * unloaded. 1845 *
1756 *
1757 * The flags are 1846 * The flags are
1758 * 1847 *
1759 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1848 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1762 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1851 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1763 * for buffer overruns. 1852 * for buffer overruns.
1764 * 1853 *
1765 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1766 * memory pressure.
1767 *
1768 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1854 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1769 * cacheline. This can be beneficial if you're counting cycles as closely 1855 * cacheline. This can be beneficial if you're counting cycles as closely
1770 * as davem. 1856 * as davem.
1771 */ 1857 */
1772struct kmem_cache * 1858struct kmem_cache *
1773kmem_cache_create (const char *name, size_t size, size_t align, 1859kmem_cache_create (const char *name, size_t size, size_t align,
1774 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), 1860 unsigned long flags,
1861 void (*ctor)(void*, struct kmem_cache *, unsigned long),
1775 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1862 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1776{ 1863{
1777 size_t left_over, slab_size, ralign; 1864 size_t left_over, slab_size, ralign;
@@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1781 /* 1868 /*
1782 * Sanity checks... these are all serious usage bugs. 1869 * Sanity checks... these are all serious usage bugs.
1783 */ 1870 */
1784 if ((!name) || 1871 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1785 in_interrupt() ||
1786 (size < BYTES_PER_WORD) ||
1787 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 1872 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1788 printk(KERN_ERR "%s: Early error in slab %s\n", 1873 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1789 __FUNCTION__, name); 1874 name);
1790 BUG(); 1875 BUG();
1791 } 1876 }
1792 1877
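For readers following the API comment above, a minimal caller looks like the sketch below. struct foo, foo_ctor() and foo_cache are invented names for illustration and are not part of this patch; the signature matches the one shown here, and the NULL return only reaches the caller when SLAB_PANIC is not set (see the oops: path later in this function).

	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/errno.h>

	struct foo {
		int x;
	};

	static struct kmem_cache *foo_cache;

	static void foo_ctor(void *obj, struct kmem_cache *cachep,
			     unsigned long flags)
	{
		memset(obj, 0, sizeof(struct foo));
	}

	static int foo_cache_init(void)
	{
		foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
					      SLAB_HWCACHE_ALIGN,
					      foo_ctor, NULL);
		if (!foo_cache)	/* only reachable without SLAB_PANIC */
			return -ENOMEM;
		return 0;
	}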
@@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1840 * above the next power of two: caches with object sizes just above a 1925 * above the next power of two: caches with object sizes just above a
1841 * power of two have a significant amount of internal fragmentation. 1926 * power of two have a significant amount of internal fragmentation.
1842 */ 1927 */
1843 if ((size < 4096 1928 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
1844 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1845 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 1929 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1846 if (!(flags & SLAB_DESTROY_BY_RCU)) 1930 if (!(flags & SLAB_DESTROY_BY_RCU))
1847 flags |= SLAB_POISON; 1931 flags |= SLAB_POISON;
@@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1853 BUG_ON(dtor); 1937 BUG_ON(dtor);
1854 1938
1855 /* 1939 /*
1856 * Always checks flags, a caller might be expecting debug 1940 * Always checks flags, a caller might be expecting debug support which
1857 * support which isn't available. 1941 * isn't available.
1858 */ 1942 */
1859 if (flags & ~CREATE_MASK) 1943 if (flags & ~CREATE_MASK)
1860 BUG(); 1944 BUG();
1861 1945
1862 /* Check that size is in terms of words. This is needed to avoid 1946 /*
1947 * Check that size is in terms of words. This is needed to avoid
1863 * unaligned accesses for some archs when redzoning is used, and makes 1948 * unaligned accesses for some archs when redzoning is used, and makes
1864 * sure any on-slab bufctl's are also correctly aligned. 1949 * sure any on-slab bufctl's are also correctly aligned.
1865 */ 1950 */
@@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1868 size &= ~(BYTES_PER_WORD - 1); 1953 size &= ~(BYTES_PER_WORD - 1);
1869 } 1954 }
1870 1955
1871 /* calculate out the final buffer alignment: */ 1956 /* calculate the final buffer alignment: */
1957
1872 /* 1) arch recommendation: can be overridden for debug */ 1958 /* 1) arch recommendation: can be overridden for debug */
1873 if (flags & SLAB_HWCACHE_ALIGN) { 1959 if (flags & SLAB_HWCACHE_ALIGN) {
1874 /* Default alignment: as specified by the arch code. 1960 /*
1875 * Except if an object is really small, then squeeze multiple 1961 * Default alignment: as specified by the arch code. Except if
1876 * objects into one cacheline. 1962 * an object is really small, then squeeze multiple objects into
1963 * one cacheline.
1877 */ 1964 */
1878 ralign = cache_line_size(); 1965 ralign = cache_line_size();
1879 while (size <= ralign / 2) 1966 while (size <= ralign / 2)
@@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1893 if (ralign > BYTES_PER_WORD) 1980 if (ralign > BYTES_PER_WORD)
1894 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 1981 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1895 } 1982 }
1896 /* 4) Store it. Note that the debug code below can reduce 1983 /*
1984 * 4) Store it. Note that the debug code below can reduce
1897 * the alignment to BYTES_PER_WORD. 1985 * the alignment to BYTES_PER_WORD.
1898 */ 1986 */
1899 align = ralign; 1987 align = ralign;
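A worked example makes the alignment calculation above easier to follow; the numbers are illustrative only and assume cache_line_size() returns 64.

	/*
	 * SLAB_HWCACHE_ALIGN, size = 20, cache_line_size() = 64:
	 *
	 *   ralign = 64;
	 *   20 <= 64/2 -> ralign = 32;
	 *   20 <= 32/2 ?  no -> loop stops.
	 *
	 * align becomes 32, so two 20-byte objects share each 64-byte cache
	 * line instead of every object consuming a whole line.
	 */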
@@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1978 cachep->gfpflags = 0; 2066 cachep->gfpflags = 0;
1979 if (flags & SLAB_CACHE_DMA) 2067 if (flags & SLAB_CACHE_DMA)
1980 cachep->gfpflags |= GFP_DMA; 2068 cachep->gfpflags |= GFP_DMA;
1981 spin_lock_init(&cachep->spinlock);
1982 cachep->buffer_size = size; 2069 cachep->buffer_size = size;
1983 2070
1984 if (flags & CFLGS_OFF_SLAB) 2071 if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1988 cachep->name = name; 2075 cachep->name = name;
1989 2076
1990 2077
1991 if (g_cpucache_up == FULL) { 2078 setup_cpu_cache(cachep);
1992 enable_cpucache(cachep);
1993 } else {
1994 if (g_cpucache_up == NONE) {
1995 /* Note: the first kmem_cache_create must create
1996 * the cache that's used by kmalloc(24), otherwise
1997 * the creation of further caches will BUG().
1998 */
1999 cachep->array[smp_processor_id()] =
2000 &initarray_generic.cache;
2001
2002 /* If the cache that's used by
2003 * kmalloc(sizeof(kmem_list3)) is the first cache,
2004 * then we need to set up all its list3s, otherwise
2005 * the creation of further caches will BUG().
2006 */
2007 set_up_list3s(cachep, SIZE_AC);
2008 if (INDEX_AC == INDEX_L3)
2009 g_cpucache_up = PARTIAL_L3;
2010 else
2011 g_cpucache_up = PARTIAL_AC;
2012 } else {
2013 cachep->array[smp_processor_id()] =
2014 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2015
2016 if (g_cpucache_up == PARTIAL_AC) {
2017 set_up_list3s(cachep, SIZE_L3);
2018 g_cpucache_up = PARTIAL_L3;
2019 } else {
2020 int node;
2021 for_each_online_node(node) {
2022
2023 cachep->nodelists[node] =
2024 kmalloc_node(sizeof
2025 (struct kmem_list3),
2026 GFP_KERNEL, node);
2027 BUG_ON(!cachep->nodelists[node]);
2028 kmem_list3_init(cachep->
2029 nodelists[node]);
2030 }
2031 }
2032 }
2033 cachep->nodelists[numa_node_id()]->next_reap =
2034 jiffies + REAPTIMEOUT_LIST3 +
2035 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2036
2037 BUG_ON(!cpu_cache_get(cachep));
2038 cpu_cache_get(cachep)->avail = 0;
2039 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2040 cpu_cache_get(cachep)->batchcount = 1;
2041 cpu_cache_get(cachep)->touched = 0;
2042 cachep->batchcount = 1;
2043 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2044 }
2045 2079
2046 /* cache setup completed, link it into the list */ 2080 /* cache setup completed, link it into the list */
2047 list_add(&cachep->next, &cache_chain); 2081 list_add(&cachep->next, &cache_chain);
2048 oops: 2082oops:
2049 if (!cachep && (flags & SLAB_PANIC)) 2083 if (!cachep && (flags & SLAB_PANIC))
2050 panic("kmem_cache_create(): failed to create slab `%s'\n", 2084 panic("kmem_cache_create(): failed to create slab `%s'\n",
2051 name); 2085 name);
@@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2089#define check_spinlock_acquired_node(x, y) do { } while(0) 2123#define check_spinlock_acquired_node(x, y) do { } while(0)
2090#endif 2124#endif
2091 2125
2092/* 2126static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2093 * Waits for all CPUs to execute func(). 2127 struct array_cache *ac,
2094 */ 2128 int force, int node);
2095static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
2096{
2097 check_irq_on();
2098 preempt_disable();
2099
2100 local_irq_disable();
2101 func(arg);
2102 local_irq_enable();
2103
2104 if (smp_call_function(func, arg, 1, 1))
2105 BUG();
2106
2107 preempt_enable();
2108}
2109
2110static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
2111 int force, int node);
2112 2129
2113static void do_drain(void *arg) 2130static void do_drain(void *arg)
2114{ 2131{
2115 struct kmem_cache *cachep = (struct kmem_cache *) arg; 2132 struct kmem_cache *cachep = arg;
2116 struct array_cache *ac; 2133 struct array_cache *ac;
2117 int node = numa_node_id(); 2134 int node = numa_node_id();
2118 2135
@@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2129 struct kmem_list3 *l3; 2146 struct kmem_list3 *l3;
2130 int node; 2147 int node;
2131 2148
2132 smp_call_function_all_cpus(do_drain, cachep); 2149 on_each_cpu(do_drain, cachep, 1, 1);
2133 check_irq_on(); 2150 check_irq_on();
2134 for_each_online_node(node) { 2151 for_each_online_node(node) {
2135 l3 = cachep->nodelists[node]; 2152 l3 = cachep->nodelists[node];
2136 if (l3) { 2153 if (l3) {
2137 spin_lock_irq(&l3->list_lock); 2154 drain_array(cachep, l3, l3->shared, 1, node);
2138 drain_array_locked(cachep, l3->shared, 1, node);
2139 spin_unlock_irq(&l3->list_lock);
2140 if (l3->alien) 2155 if (l3->alien)
2141 drain_alien_cache(cachep, l3->alien); 2156 drain_alien_cache(cachep, l3->alien);
2142 } 2157 }
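The helper deleted above ran the function locally with interrupts disabled and then on every other CPU via smp_call_function(func, arg, 1, 1); on_each_cpu(func, arg, 1, 1) has the same net effect, which is why drain_cpu_caches() here and do_tune_cpucache() further down can call it directly. In outline:

	/*
	 * Removed open-coded helper, roughly:
	 *
	 *	local_irq_disable();
	 *	func(arg);
	 *	local_irq_enable();
	 *	smp_call_function(func, arg, 1, 1);
	 *
	 * i.e. run func(arg) once on every online CPU and wait for completion,
	 * which is what the generic on_each_cpu(func, arg, 1, 1) provides.
	 */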
@@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2260 2275
2261 /* NUMA: free the list3 structures */ 2276 /* NUMA: free the list3 structures */
2262 for_each_online_node(i) { 2277 for_each_online_node(i) {
2263 if ((l3 = cachep->nodelists[i])) { 2278 l3 = cachep->nodelists[i];
2279 if (l3) {
2264 kfree(l3->shared); 2280 kfree(l3->shared);
2265 free_alien_cache(l3->alien); 2281 free_alien_cache(l3->alien);
2266 kfree(l3); 2282 kfree(l3);
2267 } 2283 }
2268 } 2284 }
2269 kmem_cache_free(&cache_cache, cachep); 2285 kmem_cache_free(&cache_cache, cachep);
2270
2271 unlock_cpu_hotplug(); 2286 unlock_cpu_hotplug();
2272
2273 return 0; 2287 return 0;
2274} 2288}
2275EXPORT_SYMBOL(kmem_cache_destroy); 2289EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2292 slabp->inuse = 0; 2306 slabp->inuse = 0;
2293 slabp->colouroff = colour_off; 2307 slabp->colouroff = colour_off;
2294 slabp->s_mem = objp + colour_off; 2308 slabp->s_mem = objp + colour_off;
2295
2296 return slabp; 2309 return slabp;
2297} 2310}
2298 2311
@@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2307 int i; 2320 int i;
2308 2321
2309 for (i = 0; i < cachep->num; i++) { 2322 for (i = 0; i < cachep->num; i++) {
2310 void *objp = slabp->s_mem + cachep->buffer_size * i; 2323 void *objp = index_to_obj(cachep, slabp, i);
2311#if DEBUG 2324#if DEBUG
2312 /* need to poison the objs? */ 2325 /* need to poison the objs? */
2313 if (cachep->flags & SLAB_POISON) 2326 if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
2320 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2333 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2321 } 2334 }
2322 /* 2335 /*
2323 * Constructors are not allowed to allocate memory from 2336 * Constructors are not allowed to allocate memory from the same
2324 * the same cache which they are a constructor for. 2337 * cache which they are a constructor for. Otherwise, deadlock.
2325 * Otherwise, deadlock. They must also be threaded. 2338 * They must also be threaded.
2326 */ 2339 */
2327 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2340 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2328 cachep->ctor(objp + obj_offset(cachep), cachep, 2341 cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
2336 slab_error(cachep, "constructor overwrote the" 2349 slab_error(cachep, "constructor overwrote the"
2337 " start of an object"); 2350 " start of an object");
2338 } 2351 }
2339 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2352 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2340 && cachep->flags & SLAB_POISON) 2353 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2341 kernel_map_pages(virt_to_page(objp), 2354 kernel_map_pages(virt_to_page(objp),
2342 cachep->buffer_size / PAGE_SIZE, 0); 2355 cachep->buffer_size / PAGE_SIZE, 0);
2343#else 2356#else
@@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
2352 2365
2353static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2366static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2354{ 2367{
2355 if (flags & SLAB_DMA) { 2368 if (flags & SLAB_DMA)
2356 if (!(cachep->gfpflags & GFP_DMA)) 2369 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2357 BUG(); 2370 else
2358 } else { 2371 BUG_ON(cachep->gfpflags & GFP_DMA);
2359 if (cachep->gfpflags & GFP_DMA)
2360 BUG();
2361 }
2362} 2372}
2363 2373
2364static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) 2374static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2375 int nodeid)
2365{ 2376{
2366 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); 2377 void *objp = index_to_obj(cachep, slabp, slabp->free);
2367 kmem_bufctl_t next; 2378 kmem_bufctl_t next;
2368 2379
2369 slabp->inuse++; 2380 slabp->inuse++;
@@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
2377 return objp; 2388 return objp;
2378} 2389}
2379 2390
2380static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, 2391static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2381 int nodeid) 2392 void *objp, int nodeid)
2382{ 2393{
2383 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; 2394 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2384 2395
2385#if DEBUG 2396#if DEBUG
2386 /* Verify that the slab belongs to the intended node */ 2397 /* Verify that the slab belongs to the intended node */
@@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2388 2399
2389 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2400 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2390 printk(KERN_ERR "slab: double free detected in cache " 2401 printk(KERN_ERR "slab: double free detected in cache "
2391 "'%s', objp %p\n", cachep->name, objp); 2402 "'%s', objp %p\n", cachep->name, objp);
2392 BUG(); 2403 BUG();
2393 } 2404 }
2394#endif 2405#endif
@@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2397 slabp->inuse--; 2408 slabp->inuse--;
2398} 2409}
2399 2410
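The arithmetic being replaced here and in the surrounding hunks spells out what the new index_to_obj()/obj_to_index() helpers must do; their definitions live earlier in slab.c and are not visible in this patch, so the following is only an assumed sketch that matches the expressions they replace.

	/* Assumed helper shapes, inferred from the open-coded arithmetic: */
	static inline void *index_to_obj(struct kmem_cache *cache,
					 struct slab *slab, unsigned int idx)
	{
		return slab->s_mem + cache->buffer_size * idx;
	}

	static inline unsigned int obj_to_index(struct kmem_cache *cache,
						struct slab *slab, void *obj)
	{
		return (unsigned int)(obj - slab->s_mem) / cache->buffer_size;
	}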
2400static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) 2411static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
2412 void *objp)
2401{ 2413{
2402 int i; 2414 int i;
2403 struct page *page; 2415 struct page *page;
2404 2416
2405 /* Nasty!!!!!! I hope this is OK. */ 2417 /* Nasty!!!!!! I hope this is OK. */
2406 i = 1 << cachep->gfporder;
2407 page = virt_to_page(objp); 2418 page = virt_to_page(objp);
2419
2420 i = 1;
2421 if (likely(!PageCompound(page)))
2422 i <<= cachep->gfporder;
2408 do { 2423 do {
2409 page_set_cache(page, cachep); 2424 page_set_cache(page, cachep);
2410 page_set_slab(page, slabp); 2425 page_set_slab(page, slabp);
@@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2425 unsigned long ctor_flags; 2440 unsigned long ctor_flags;
2426 struct kmem_list3 *l3; 2441 struct kmem_list3 *l3;
2427 2442
2428 /* Be lazy and only check for valid flags here, 2443 /*
2429 * keeping it out of the critical path in kmem_cache_alloc(). 2444 * Be lazy and only check for valid flags here, keeping it out of the
2445 * critical path in kmem_cache_alloc().
2430 */ 2446 */
2431 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2447 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2432 BUG(); 2448 BUG();
@@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2467 */ 2483 */
2468 kmem_flagcheck(cachep, flags); 2484 kmem_flagcheck(cachep, flags);
2469 2485
2470 /* Get mem for the objs. 2486 /*
2471 * Attempt to allocate a physical page from 'nodeid', 2487 * Get mem for the objs. Attempt to allocate a physical page from
2488 * 'nodeid'.
2472 */ 2489 */
2473 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2490 objp = kmem_getpages(cachep, flags, nodeid);
2491 if (!objp)
2474 goto failed; 2492 goto failed;
2475 2493
2476 /* Get slab management. */ 2494 /* Get slab management. */
2477 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2495 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
2496 if (!slabp)
2478 goto opps1; 2497 goto opps1;
2479 2498
2480 slabp->nodeid = nodeid; 2499 slabp->nodeid = nodeid;
@@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2493 l3->free_objects += cachep->num; 2512 l3->free_objects += cachep->num;
2494 spin_unlock(&l3->list_lock); 2513 spin_unlock(&l3->list_lock);
2495 return 1; 2514 return 1;
2496 opps1: 2515opps1:
2497 kmem_freepages(cachep, objp); 2516 kmem_freepages(cachep, objp);
2498 failed: 2517failed:
2499 if (local_flags & __GFP_WAIT) 2518 if (local_flags & __GFP_WAIT)
2500 local_irq_disable(); 2519 local_irq_disable();
2501 return 0; 2520 return 0;
@@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2538 page = virt_to_page(objp); 2557 page = virt_to_page(objp);
2539 2558
2540 if (page_get_cache(page) != cachep) { 2559 if (page_get_cache(page) != cachep) {
2541 printk(KERN_ERR 2560 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2542 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2561 "cache %p, got %p\n",
2543 page_get_cache(page), cachep); 2562 page_get_cache(page), cachep);
2544 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2563 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2545 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2564 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2549 slabp = page_get_slab(page); 2568 slabp = page_get_slab(page);
2550 2569
2551 if (cachep->flags & SLAB_RED_ZONE) { 2570 if (cachep->flags & SLAB_RED_ZONE) {
2552 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2571 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
2553 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2572 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2554 slab_error(cachep, 2573 slab_error(cachep, "double free, or memory outside"
2555 "double free, or memory outside" 2574 " object was overwritten");
2556 " object was overwritten"); 2575 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2557 printk(KERN_ERR 2576 "redzone 2:0x%lx.\n",
2558 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2559 objp, *dbg_redzone1(cachep, objp), 2577 objp, *dbg_redzone1(cachep, objp),
2560 *dbg_redzone2(cachep, objp)); 2578 *dbg_redzone2(cachep, objp));
2561 } 2579 }
@@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2565 if (cachep->flags & SLAB_STORE_USER) 2583 if (cachep->flags & SLAB_STORE_USER)
2566 *dbg_userword(cachep, objp) = caller; 2584 *dbg_userword(cachep, objp) = caller;
2567 2585
2568 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2586 objnr = obj_to_index(cachep, slabp, objp);
2569 2587
2570 BUG_ON(objnr >= cachep->num); 2588 BUG_ON(objnr >= cachep->num);
2571 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); 2589 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2572 2590
2573 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2591 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2574 /* Need to call the slab's constructor so the 2592 /*
2575 * caller can perform a verify of its state (debugging). 2593 * Need to call the slab's constructor so the caller can
2576 * Called without the cache-lock held. 2594 * perform a verify of its state (debugging). Called without
2595 * the cache-lock held.
2577 */ 2596 */
2578 cachep->ctor(objp + obj_offset(cachep), 2597 cachep->ctor(objp + obj_offset(cachep),
2579 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2598 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2586 } 2605 }
2587 if (cachep->flags & SLAB_POISON) { 2606 if (cachep->flags & SLAB_POISON) {
2588#ifdef CONFIG_DEBUG_PAGEALLOC 2607#ifdef CONFIG_DEBUG_PAGEALLOC
2589 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2608 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2590 store_stackinfo(cachep, objp, (unsigned long)caller); 2609 store_stackinfo(cachep, objp, (unsigned long)caller);
2591 kernel_map_pages(virt_to_page(objp), 2610 kernel_map_pages(virt_to_page(objp),
2592 cachep->buffer_size / PAGE_SIZE, 0); 2611 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2612 goto bad; 2631 goto bad;
2613 } 2632 }
2614 if (entries != cachep->num - slabp->inuse) { 2633 if (entries != cachep->num - slabp->inuse) {
2615 bad: 2634bad:
2616 printk(KERN_ERR 2635 printk(KERN_ERR "slab: Internal list corruption detected in "
2617 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2636 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2618 cachep->name, cachep->num, slabp, slabp->inuse); 2637 cachep->name, cachep->num, slabp, slabp->inuse);
2619 for (i = 0; 2638 for (i = 0;
2620 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2639 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2621 i++) { 2640 i++) {
2622 if ((i % 16) == 0) 2641 if (i % 16 == 0)
2623 printk("\n%03x:", i); 2642 printk("\n%03x:", i);
2624 printk(" %02x", ((unsigned char *)slabp)[i]); 2643 printk(" %02x", ((unsigned char *)slabp)[i]);
2625 } 2644 }
@@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2641 2660
2642 check_irq_off(); 2661 check_irq_off();
2643 ac = cpu_cache_get(cachep); 2662 ac = cpu_cache_get(cachep);
2644 retry: 2663retry:
2645 batchcount = ac->batchcount; 2664 batchcount = ac->batchcount;
2646 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2665 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2647 /* if there was little recent activity on this 2666 /*
2648 * cache, then perform only a partial refill. 2667 * If there was little recent activity on this cache, then
2649 * Otherwise we could generate refill bouncing. 2668 * perform only a partial refill. Otherwise we could generate
2669 * refill bouncing.
2650 */ 2670 */
2651 batchcount = BATCHREFILL_LIMIT; 2671 batchcount = BATCHREFILL_LIMIT;
2652 } 2672 }
@@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2702 list_add(&slabp->list, &l3->slabs_partial); 2722 list_add(&slabp->list, &l3->slabs_partial);
2703 } 2723 }
2704 2724
2705 must_grow: 2725must_grow:
2706 l3->free_objects -= ac->avail; 2726 l3->free_objects -= ac->avail;
2707 alloc_done: 2727alloc_done:
2708 spin_unlock(&l3->list_lock); 2728 spin_unlock(&l3->list_lock);
2709 2729
2710 if (unlikely(!ac->avail)) { 2730 if (unlikely(!ac->avail)) {
2711 int x; 2731 int x;
2712 x = cache_grow(cachep, flags, numa_node_id()); 2732 x = cache_grow(cachep, flags, numa_node_id());
2713 2733
2714 // cache_grow can reenable interrupts, then ac could change. 2734 /* cache_grow can reenable interrupts, then ac could change. */
2715 ac = cpu_cache_get(cachep); 2735 ac = cpu_cache_get(cachep);
2716 if (!x && ac->avail == 0) // no objects in sight? abort 2736 if (!x && ac->avail == 0) /* no objects in sight? abort */
2717 return NULL; 2737 return NULL;
2718 2738
2719 if (!ac->avail) // objects refilled by interrupt? 2739 if (!ac->avail) /* objects refilled by interrupt? */
2720 goto retry; 2740 goto retry;
2721 } 2741 }
2722 ac->touched = 1; 2742 ac->touched = 1;
2723 return ac->entry[--ac->avail]; 2743 return ac->entry[--ac->avail];
2724} 2744}
2725 2745
2726static inline void 2746static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2727cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) 2747 gfp_t flags)
2728{ 2748{
2729 might_sleep_if(flags & __GFP_WAIT); 2749 might_sleep_if(flags & __GFP_WAIT);
2730#if DEBUG 2750#if DEBUG
@@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2733} 2753}
2734 2754
2735#if DEBUG 2755#if DEBUG
2736static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, 2756static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2737 void *objp, void *caller) 2757 gfp_t flags, void *objp, void *caller)
2738{ 2758{
2739 if (!objp) 2759 if (!objp)
2740 return objp; 2760 return objp;
@@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
2754 *dbg_userword(cachep, objp) = caller; 2774 *dbg_userword(cachep, objp) = caller;
2755 2775
2756 if (cachep->flags & SLAB_RED_ZONE) { 2776 if (cachep->flags & SLAB_RED_ZONE) {
2757 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2777 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2758 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2778 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2759 slab_error(cachep, 2779 slab_error(cachep, "double free, or memory outside"
2760 "double free, or memory outside" 2780 " object was overwritten");
2761 " object was overwritten");
2762 printk(KERN_ERR 2781 printk(KERN_ERR
2763 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2782 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2764 objp, *dbg_redzone1(cachep, objp), 2783 objp, *dbg_redzone1(cachep, objp),
2765 *dbg_redzone2(cachep, objp)); 2784 *dbg_redzone2(cachep, objp));
2766 } 2785 }
2767 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2786 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2768 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2787 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2809 return objp; 2828 return objp;
2810} 2829}
2811 2830
2812static __always_inline void * 2831static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2813__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 2832 gfp_t flags, void *caller)
2814{ 2833{
2815 unsigned long save_flags; 2834 unsigned long save_flags;
2816 void *objp; 2835 void *objp;
@@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2830/* 2849/*
2831 * A interface to enable slab creation on nodeid 2850 * A interface to enable slab creation on nodeid
2832 */ 2851 */
2833static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2852static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2853 int nodeid)
2834{ 2854{
2835 struct list_head *entry; 2855 struct list_head *entry;
2836 struct slab *slabp; 2856 struct slab *slabp;
@@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2841 l3 = cachep->nodelists[nodeid]; 2861 l3 = cachep->nodelists[nodeid];
2842 BUG_ON(!l3); 2862 BUG_ON(!l3);
2843 2863
2844 retry: 2864retry:
2845 check_irq_off(); 2865 check_irq_off();
2846 spin_lock(&l3->list_lock); 2866 spin_lock(&l3->list_lock);
2847 entry = l3->slabs_partial.next; 2867 entry = l3->slabs_partial.next;
@@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2868 /* move slabp to correct slabp list: */ 2888 /* move slabp to correct slabp list: */
2869 list_del(&slabp->list); 2889 list_del(&slabp->list);
2870 2890
2871 if (slabp->free == BUFCTL_END) { 2891 if (slabp->free == BUFCTL_END)
2872 list_add(&slabp->list, &l3->slabs_full); 2892 list_add(&slabp->list, &l3->slabs_full);
2873 } else { 2893 else
2874 list_add(&slabp->list, &l3->slabs_partial); 2894 list_add(&slabp->list, &l3->slabs_partial);
2875 }
2876 2895
2877 spin_unlock(&l3->list_lock); 2896 spin_unlock(&l3->list_lock);
2878 goto done; 2897 goto done;
2879 2898
2880 must_grow: 2899must_grow:
2881 spin_unlock(&l3->list_lock); 2900 spin_unlock(&l3->list_lock);
2882 x = cache_grow(cachep, flags, nodeid); 2901 x = cache_grow(cachep, flags, nodeid);
2883 2902
@@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2885 return NULL; 2904 return NULL;
2886 2905
2887 goto retry; 2906 goto retry;
2888 done: 2907done:
2889 return obj; 2908 return obj;
2890} 2909}
2891#endif 2910#endif
@@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2958 } 2977 }
2959 2978
2960 free_block(cachep, ac->entry, batchcount, node); 2979 free_block(cachep, ac->entry, batchcount, node);
2961 free_done: 2980free_done:
2962#if STATS 2981#if STATS
2963 { 2982 {
2964 int i = 0; 2983 int i = 0;
@@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2979#endif 2998#endif
2980 spin_unlock(&l3->list_lock); 2999 spin_unlock(&l3->list_lock);
2981 ac->avail -= batchcount; 3000 ac->avail -= batchcount;
2982 memmove(ac->entry, &(ac->entry[batchcount]), 3001 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
2983 sizeof(void *) * ac->avail);
2984} 3002}
2985 3003
2986/* 3004/*
2987 * __cache_free 3005 * Release an obj back to its cache. If the obj has a constructed state, it must
2988 * Release an obj back to its cache. If the obj has a constructed 3006 * be in this state _before_ it is released. Called with disabled ints.
2989 * state, it must be in this state _before_ it is released.
2990 *
2991 * Called with disabled ints.
2992 */ 3007 */
2993static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3008static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2994{ 3009{
@@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3007 if (unlikely(slabp->nodeid != numa_node_id())) { 3022 if (unlikely(slabp->nodeid != numa_node_id())) {
3008 struct array_cache *alien = NULL; 3023 struct array_cache *alien = NULL;
3009 int nodeid = slabp->nodeid; 3024 int nodeid = slabp->nodeid;
3010 struct kmem_list3 *l3 = 3025 struct kmem_list3 *l3;
3011 cachep->nodelists[numa_node_id()];
3012 3026
3027 l3 = cachep->nodelists[numa_node_id()];
3013 STATS_INC_NODEFREES(cachep); 3028 STATS_INC_NODEFREES(cachep);
3014 if (l3->alien && l3->alien[nodeid]) { 3029 if (l3->alien && l3->alien[nodeid]) {
3015 alien = l3->alien[nodeid]; 3030 alien = l3->alien[nodeid];
@@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3093 if (unlikely(page_get_cache(page) != cachep)) 3108 if (unlikely(page_get_cache(page) != cachep))
3094 goto out; 3109 goto out;
3095 return 1; 3110 return 1;
3096 out: 3111out:
3097 return 0; 3112 return 0;
3098} 3113}
3099 3114
@@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3119 local_irq_save(save_flags); 3134 local_irq_save(save_flags);
3120 3135
3121 if (nodeid == -1 || nodeid == numa_node_id() || 3136 if (nodeid == -1 || nodeid == numa_node_id() ||
3122 !cachep->nodelists[nodeid]) 3137 !cachep->nodelists[nodeid])
3123 ptr = ____cache_alloc(cachep, flags); 3138 ptr = ____cache_alloc(cachep, flags);
3124 else 3139 else
3125 ptr = __cache_alloc_node(cachep, flags, nodeid); 3140 ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node);
3148 * kmalloc - allocate memory 3163 * kmalloc - allocate memory
3149 * @size: how many bytes of memory are required. 3164 * @size: how many bytes of memory are required.
3150 * @flags: the type of memory to allocate. 3165 * @flags: the type of memory to allocate.
3166 * @caller: function caller for debug tracking of the caller
3151 * 3167 *
3152 * kmalloc is the normal method of allocating memory 3168 * kmalloc is the normal method of allocating memory
3153 * in the kernel. 3169 * in the kernel.
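A reminder of the call side of the interface documented above (illustration only; struct bar and bar_alloc() are made-up names, and error handling is reduced to the bare minimum):

	#include <linux/slab.h>

	struct bar {
		int refs;
	};

	static struct bar *bar_alloc(void)
	{
		struct bar *b = kmalloc(sizeof(*b), GFP_KERNEL); /* may sleep */

		if (b)
			b->refs = 1;
		return b;		/* released later with kfree(b) */
	}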
@@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size)
3236 /* Catch derefs w/o wrappers */ 3252 /* Catch derefs w/o wrappers */
3237 return (void *)(~(unsigned long)pdata); 3253 return (void *)(~(unsigned long)pdata);
3238 3254
3239 unwind_oom: 3255unwind_oom:
3240 while (--i >= 0) { 3256 while (--i >= 0) {
3241 if (!cpu_possible(i)) 3257 if (!cpu_possible(i))
3242 continue; 3258 continue;
@@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3339 struct array_cache *nc = NULL, *new; 3355 struct array_cache *nc = NULL, *new;
3340 struct array_cache **new_alien = NULL; 3356 struct array_cache **new_alien = NULL;
3341#ifdef CONFIG_NUMA 3357#ifdef CONFIG_NUMA
3342 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3358 new_alien = alloc_alien_cache(node, cachep->limit);
3359 if (!new_alien)
3343 goto fail; 3360 goto fail;
3344#endif 3361#endif
3345 if (!(new = alloc_arraycache(node, (cachep->shared * 3362 new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
3346 cachep->batchcount), 3363 0xbaadf00d);
3347 0xbaadf00d))) 3364 if (!new)
3348 goto fail; 3365 goto fail;
3349 if ((l3 = cachep->nodelists[node])) { 3366 l3 = cachep->nodelists[node];
3350 3367 if (l3) {
3351 spin_lock_irq(&l3->list_lock); 3368 spin_lock_irq(&l3->list_lock);
3352 3369
3353 if ((nc = cachep->nodelists[node]->shared)) 3370 nc = cachep->nodelists[node]->shared;
3371 if (nc)
3354 free_block(cachep, nc->entry, nc->avail, node); 3372 free_block(cachep, nc->entry, nc->avail, node);
3355 3373
3356 l3->shared = new; 3374 l3->shared = new;
@@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3359 new_alien = NULL; 3377 new_alien = NULL;
3360 } 3378 }
3361 l3->free_limit = (1 + nr_cpus_node(node)) * 3379 l3->free_limit = (1 + nr_cpus_node(node)) *
3362 cachep->batchcount + cachep->num; 3380 cachep->batchcount + cachep->num;
3363 spin_unlock_irq(&l3->list_lock); 3381 spin_unlock_irq(&l3->list_lock);
3364 kfree(nc); 3382 kfree(nc);
3365 free_alien_cache(new_alien); 3383 free_alien_cache(new_alien);
3366 continue; 3384 continue;
3367 } 3385 }
3368 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3386 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3369 GFP_KERNEL, node))) 3387 if (!l3)
3370 goto fail; 3388 goto fail;
3371 3389
3372 kmem_list3_init(l3); 3390 kmem_list3_init(l3);
3373 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3391 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3374 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3392 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3375 l3->shared = new; 3393 l3->shared = new;
3376 l3->alien = new_alien; 3394 l3->alien = new_alien;
3377 l3->free_limit = (1 + nr_cpus_node(node)) * 3395 l3->free_limit = (1 + nr_cpus_node(node)) *
3378 cachep->batchcount + cachep->num; 3396 cachep->batchcount + cachep->num;
3379 cachep->nodelists[node] = l3; 3397 cachep->nodelists[node] = l3;
3380 } 3398 }
3381 return err; 3399 return err;
3382 fail: 3400fail:
3383 err = -ENOMEM; 3401 err = -ENOMEM;
3384 return err; 3402 return err;
3385} 3403}
@@ -3391,7 +3409,7 @@ struct ccupdate_struct {
3391 3409
3392static void do_ccupdate_local(void *info) 3410static void do_ccupdate_local(void *info)
3393{ 3411{
3394 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3412 struct ccupdate_struct *new = info;
3395 struct array_cache *old; 3413 struct array_cache *old;
3396 3414
3397 check_irq_off(); 3415 check_irq_off();
@@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info)
3401 new->new[smp_processor_id()] = old; 3419 new->new[smp_processor_id()] = old;
3402} 3420}
3403 3421
3404static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, 3422/* Always called with the cache_chain_mutex held */
3405 int shared) 3423static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3424 int batchcount, int shared)
3406{ 3425{
3407 struct ccupdate_struct new; 3426 struct ccupdate_struct new;
3408 int i, err; 3427 int i, err;
3409 3428
3410 memset(&new.new, 0, sizeof(new.new)); 3429 memset(&new.new, 0, sizeof(new.new));
3411 for_each_online_cpu(i) { 3430 for_each_online_cpu(i) {
3412 new.new[i] = 3431 new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3413 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3432 batchcount);
3414 if (!new.new[i]) { 3433 if (!new.new[i]) {
3415 for (i--; i >= 0; i--) 3434 for (i--; i >= 0; i--)
3416 kfree(new.new[i]); 3435 kfree(new.new[i]);
@@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3419 } 3438 }
3420 new.cachep = cachep; 3439 new.cachep = cachep;
3421 3440
3422 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3441 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3423 3442
3424 check_irq_on(); 3443 check_irq_on();
3425 spin_lock(&cachep->spinlock);
3426 cachep->batchcount = batchcount; 3444 cachep->batchcount = batchcount;
3427 cachep->limit = limit; 3445 cachep->limit = limit;
3428 cachep->shared = shared; 3446 cachep->shared = shared;
3429 spin_unlock(&cachep->spinlock);
3430 3447
3431 for_each_online_cpu(i) { 3448 for_each_online_cpu(i) {
3432 struct array_cache *ccold = new.new[i]; 3449 struct array_cache *ccold = new.new[i];
@@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3447 return 0; 3464 return 0;
3448} 3465}
3449 3466
3467/* Called with cache_chain_mutex held always */
3450static void enable_cpucache(struct kmem_cache *cachep) 3468static void enable_cpucache(struct kmem_cache *cachep)
3451{ 3469{
3452 int err; 3470 int err;
3453 int limit, shared; 3471 int limit, shared;
3454 3472
3455 /* The head array serves three purposes: 3473 /*
3474 * The head array serves three purposes:
3456 * - create a LIFO ordering, i.e. return objects that are cache-warm 3475 * - create a LIFO ordering, i.e. return objects that are cache-warm
3457 * - reduce the number of spinlock operations. 3476 * - reduce the number of spinlock operations.
3458 * - reduce the number of linked list operations on the slab and 3477 * - reduce the number of linked list operations on the slab and
3459 * bufctl chains: array operations are cheaper. 3478 * bufctl chains: array operations are cheaper.
3460 * The numbers are guessed, we should auto-tune as described by 3479 * The numbers are guessed, we should auto-tune as described by
3461 * Bonwick. 3480 * Bonwick.
@@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
3471 else 3490 else
3472 limit = 120; 3491 limit = 120;
3473 3492
3474 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3493 /*
3494 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3475 * allocation behaviour: Most allocs on one cpu, most free operations 3495 * allocation behaviour: Most allocs on one cpu, most free operations
3476 * on another cpu. For these cases, an efficient object passing between 3496 * on another cpu. For these cases, an efficient object passing between
3477 * cpus is necessary. This is provided by a shared array. The array 3497 * cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
3486#endif 3506#endif
3487 3507
3488#if DEBUG 3508#if DEBUG
3489 /* With debugging enabled, large batchcount lead to excessively 3509 /*
3490 * long periods with disabled local interrupts. Limit the 3510 * With debugging enabled, large batchcount lead to excessively long
3491 * batchcount 3511 * periods with disabled local interrupts. Limit the batchcount
3492 */ 3512 */
3493 if (limit > 32) 3513 if (limit > 32)
3494 limit = 32; 3514 limit = 32;
@@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
3499 cachep->name, -err); 3519 cachep->name, -err);
3500} 3520}
3501 3521
3502static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, 3522/*
3503 int force, int node) 3523 * Drain an array if it contains any elements taking the l3 lock only if
3524 * necessary. Note that the l3 listlock also protects the array_cache
3525 * if drain_array() is used on the shared array.
3526 */
3527void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3528 struct array_cache *ac, int force, int node)
3504{ 3529{
3505 int tofree; 3530 int tofree;
3506 3531
3507 check_spinlock_acquired_node(cachep, node); 3532 if (!ac || !ac->avail)
3533 return;
3508 if (ac->touched && !force) { 3534 if (ac->touched && !force) {
3509 ac->touched = 0; 3535 ac->touched = 0;
3510 } else if (ac->avail) { 3536 } else {
3511 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3537 spin_lock_irq(&l3->list_lock);
3512 if (tofree > ac->avail) { 3538 if (ac->avail) {
3513 tofree = (ac->avail + 1) / 2; 3539 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3540 if (tofree > ac->avail)
3541 tofree = (ac->avail + 1) / 2;
3542 free_block(cachep, ac->entry, tofree, node);
3543 ac->avail -= tofree;
3544 memmove(ac->entry, &(ac->entry[tofree]),
3545 sizeof(void *) * ac->avail);
3514 } 3546 }
3515 free_block(cachep, ac->entry, tofree, node); 3547 spin_unlock_irq(&l3->list_lock);
3516 ac->avail -= tofree;
3517 memmove(ac->entry, &(ac->entry[tofree]),
3518 sizeof(void *) * ac->avail);
3519 } 3548 }
3520} 3549}
3521 3550
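After the rewrite above, drain_array() takes the l3 lock itself and returns immediately for an empty array, so its callers no longer wrap it in spin_lock_irq(&l3->list_lock). The call sites as they appear elsewhere in this patch:

	/* drain_cpu_caches(): force-drain the shared array for each node */
	drain_array(cachep, l3, l3->shared, 1, node);

	/* cache_reap(): opportunistic drains, lock taken only when needed */
	drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
	drain_array(searchp, l3, l3->shared, 0, node);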
@@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
3528 * - clear the per-cpu caches for this CPU. 3557 * - clear the per-cpu caches for this CPU.
3529 * - return freeable pages to the main free memory pool. 3558 * - return freeable pages to the main free memory pool.
3530 * 3559 *
3531 * If we cannot acquire the cache chain mutex then just give up - we'll 3560 * If we cannot acquire the cache chain mutex then just give up - we'll try
3532 * try again on the next iteration. 3561 * again on the next iteration.
3533 */ 3562 */
3534static void cache_reap(void *unused) 3563static void cache_reap(void *unused)
3535{ 3564{
3536 struct list_head *walk; 3565 struct list_head *walk;
3537 struct kmem_list3 *l3; 3566 struct kmem_list3 *l3;
3567 int node = numa_node_id();
3538 3568
3539 if (!mutex_trylock(&cache_chain_mutex)) { 3569 if (!mutex_trylock(&cache_chain_mutex)) {
3540 /* Give up. Setup the next iteration. */ 3570 /* Give up. Setup the next iteration. */
@@ -3550,65 +3580,72 @@ static void cache_reap(void *unused)
3550 struct slab *slabp; 3580 struct slab *slabp;
3551 3581
3552 searchp = list_entry(walk, struct kmem_cache, next); 3582 searchp = list_entry(walk, struct kmem_cache, next);
3553
3554 if (searchp->flags & SLAB_NO_REAP)
3555 goto next;
3556
3557 check_irq_on(); 3583 check_irq_on();
3558 3584
3559 l3 = searchp->nodelists[numa_node_id()]; 3585 /*
3586 * We only take the l3 lock if absolutely necessary and we
3587 * have established with reasonable certainty that
3588 * we can do some work if the lock was obtained.
3589 */
3590 l3 = searchp->nodelists[node];
3591
3560 reap_alien(searchp, l3); 3592 reap_alien(searchp, l3);
3561 spin_lock_irq(&l3->list_lock);
3562 3593
3563 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3594 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3564 numa_node_id());
3565 3595
3596 /*
3597 * These are racy checks but it does not matter
3598 * if we skip one check or scan twice.
3599 */
3566 if (time_after(l3->next_reap, jiffies)) 3600 if (time_after(l3->next_reap, jiffies))
3567 goto next_unlock; 3601 goto next;
3568 3602
3569 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3603 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3570 3604
3571 if (l3->shared) 3605 drain_array(searchp, l3, l3->shared, 0, node);
3572 drain_array_locked(searchp, l3->shared, 0,
3573 numa_node_id());
3574 3606
3575 if (l3->free_touched) { 3607 if (l3->free_touched) {
3576 l3->free_touched = 0; 3608 l3->free_touched = 0;
3577 goto next_unlock; 3609 goto next;
3578 } 3610 }
3579 3611
3580 tofree = 3612 tofree = (l3->free_limit + 5 * searchp->num - 1) /
3581 (l3->free_limit + 5 * searchp->num - 3613 (5 * searchp->num);
3582 1) / (5 * searchp->num);
3583 do { 3614 do {
3615 /*
3616 * Do not lock if there are no free blocks.
3617 */
3618 if (list_empty(&l3->slabs_free))
3619 break;
3620
3621 spin_lock_irq(&l3->list_lock);
3584 p = l3->slabs_free.next; 3622 p = l3->slabs_free.next;
3585 if (p == &(l3->slabs_free)) 3623 if (p == &(l3->slabs_free)) {
3624 spin_unlock_irq(&l3->list_lock);
3586 break; 3625 break;
3626 }
3587 3627
3588 slabp = list_entry(p, struct slab, list); 3628 slabp = list_entry(p, struct slab, list);
3589 BUG_ON(slabp->inuse); 3629 BUG_ON(slabp->inuse);
3590 list_del(&slabp->list); 3630 list_del(&slabp->list);
3591 STATS_INC_REAPED(searchp); 3631 STATS_INC_REAPED(searchp);
3592 3632
3593 /* Safe to drop the lock. The slab is no longer 3633 /*
3594 * linked to the cache. 3634 * Safe to drop the lock. The slab is no longer linked
3595 * searchp cannot disappear, we hold 3635 * to the cache. searchp cannot disappear, we hold
3596 * cache_chain_lock 3636 * cache_chain_lock
3597 */ 3637 */
3598 l3->free_objects -= searchp->num; 3638 l3->free_objects -= searchp->num;
3599 spin_unlock_irq(&l3->list_lock); 3639 spin_unlock_irq(&l3->list_lock);
3600 slab_destroy(searchp, slabp); 3640 slab_destroy(searchp, slabp);
3601 spin_lock_irq(&l3->list_lock);
3602 } while (--tofree > 0); 3641 } while (--tofree > 0);
3603 next_unlock: 3642next:
3604 spin_unlock_irq(&l3->list_lock);
3605 next:
3606 cond_resched(); 3643 cond_resched();
3607 } 3644 }
3608 check_irq_on(); 3645 check_irq_on();
3609 mutex_unlock(&cache_chain_mutex); 3646 mutex_unlock(&cache_chain_mutex);
3610 next_reap_node(); 3647 next_reap_node();
3611 /* Setup the next iteration */ 3648 /* Set up the next iteration */
3612 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3649 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3613} 3650}
3614 3651
@@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3658{ 3695{
3659 struct kmem_cache *cachep = p; 3696 struct kmem_cache *cachep = p;
3660 ++*pos; 3697 ++*pos;
3661 return cachep->next.next == &cache_chain ? NULL 3698 return cachep->next.next == &cache_chain ?
3662 : list_entry(cachep->next.next, struct kmem_cache, next); 3699 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3663} 3700}
3664 3701
3665static void s_stop(struct seq_file *m, void *p) 3702static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p)
3681 int node; 3718 int node;
3682 struct kmem_list3 *l3; 3719 struct kmem_list3 *l3;
3683 3720
3684 spin_lock(&cachep->spinlock);
3685 active_objs = 0; 3721 active_objs = 0;
3686 num_slabs = 0; 3722 num_slabs = 0;
3687 for_each_online_node(node) { 3723 for_each_online_node(node) {
@@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p)
3748 unsigned long node_frees = cachep->node_frees; 3784 unsigned long node_frees = cachep->node_frees;
3749 3785
3750 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3786 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3751 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3787 %4lu %4lu %4lu %4lu", allocs, high, grown,
3788 reaped, errors, max_freeable, node_allocs,
3789 node_frees);
3752 } 3790 }
3753 /* cpu stats */ 3791 /* cpu stats */
3754 { 3792 {
@@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p)
3762 } 3800 }
3763#endif 3801#endif
3764 seq_putc(m, '\n'); 3802 seq_putc(m, '\n');
3765 spin_unlock(&cachep->spinlock);
3766 return 0; 3803 return 0;
3767} 3804}
3768 3805
@@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3820 mutex_lock(&cache_chain_mutex); 3857 mutex_lock(&cache_chain_mutex);
3821 res = -EINVAL; 3858 res = -EINVAL;
3822 list_for_each(p, &cache_chain) { 3859 list_for_each(p, &cache_chain) {
3823 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, 3860 struct kmem_cache *cachep;
3824 next);
3825 3861
3862 cachep = list_entry(p, struct kmem_cache, next);
3826 if (!strcmp(cachep->name, kbuf)) { 3863 if (!strcmp(cachep->name, kbuf)) {
3827 if (limit < 1 || 3864 if (limit < 1 || batchcount < 1 ||
3828 batchcount < 1 || 3865 batchcount > limit || shared < 0) {
3829 batchcount > limit || shared < 0) {
3830 res = 0; 3866 res = 0;
3831 } else { 3867 } else {
3832 res = do_tune_cpucache(cachep, limit, 3868 res = do_tune_cpucache(cachep, limit,
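With cachep->spinlock gone from kmem_cache_create(), do_tune_cpucache() and s_show(), serialization of tuning and statistics falls to cache_chain_mutex, which the two comments added above make explicit. The expected calling convention, as slabinfo_write() already demonstrates:

	/* tune a cache's per-cpu arrays; the caller provides serialization */
	mutex_lock(&cache_chain_mutex);
	err = do_tune_cpucache(cachep, limit, batchcount, shared);
	mutex_unlock(&cache_chain_mutex);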
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bddb..91b7e2026f69 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
209 */ 209 */
210void fastcall __page_cache_release(struct page *page) 210void fastcall __page_cache_release(struct page *page)
211{ 211{
212 unsigned long flags; 212 if (PageLRU(page)) {
213 struct zone *zone = page_zone(page); 213 unsigned long flags;
214 struct zone *zone = page_zone(page);
214 215
215 spin_lock_irqsave(&zone->lru_lock, flags); 216 spin_lock_irqsave(&zone->lru_lock, flags);
216 if (TestClearPageLRU(page)) 217 BUG_ON(!PageLRU(page));
218 __ClearPageLRU(page);
217 del_page_from_lru(zone, page); 219 del_page_from_lru(zone, page);
218 if (page_count(page) != 0) 220 spin_unlock_irqrestore(&zone->lru_lock, flags);
219 page = NULL; 221 }
220 spin_unlock_irqrestore(&zone->lru_lock, flags); 222 free_hot_page(page);
221 if (page)
222 free_hot_page(page);
223} 223}
224
225EXPORT_SYMBOL(__page_cache_release); 224EXPORT_SYMBOL(__page_cache_release);
226 225
227/* 226/*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
245 pagevec_init(&pages_to_free, cold); 244 pagevec_init(&pages_to_free, cold);
246 for (i = 0; i < nr; i++) { 245 for (i = 0; i < nr; i++) {
247 struct page *page = pages[i]; 246 struct page *page = pages[i];
248 struct zone *pagezone;
249 247
250 if (unlikely(PageCompound(page))) { 248 if (unlikely(PageCompound(page))) {
251 if (zone) { 249 if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
259 if (!put_page_testzero(page)) 257 if (!put_page_testzero(page))
260 continue; 258 continue;
261 259
262 pagezone = page_zone(page); 260 if (PageLRU(page)) {
263 if (pagezone != zone) { 261 struct zone *pagezone = page_zone(page);
264 if (zone) 262 if (pagezone != zone) {
265 spin_unlock_irq(&zone->lru_lock); 263 if (zone)
266 zone = pagezone; 264 spin_unlock_irq(&zone->lru_lock);
267 spin_lock_irq(&zone->lru_lock); 265 zone = pagezone;
268 } 266 spin_lock_irq(&zone->lru_lock);
269 if (TestClearPageLRU(page)) 267 }
268 BUG_ON(!PageLRU(page));
269 __ClearPageLRU(page);
270 del_page_from_lru(zone, page); 270 del_page_from_lru(zone, page);
271 if (page_count(page) == 0) { 271 }
272 if (!pagevec_add(&pages_to_free, page)) { 272
273 if (!pagevec_add(&pages_to_free, page)) {
274 if (zone) {
273 spin_unlock_irq(&zone->lru_lock); 275 spin_unlock_irq(&zone->lru_lock);
274 __pagevec_free(&pages_to_free); 276 zone = NULL;
275 pagevec_reinit(&pages_to_free);
276 zone = NULL; /* No lock is held */
277 } 277 }
278 } 278 __pagevec_free(&pages_to_free);
279 pagevec_reinit(&pages_to_free);
280 }
279 } 281 }
280 if (zone) 282 if (zone)
281 spin_unlock_irq(&zone->lru_lock); 283 spin_unlock_irq(&zone->lru_lock);
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
343 zone = pagezone; 345 zone = pagezone;
344 spin_lock_irq(&zone->lru_lock); 346 spin_lock_irq(&zone->lru_lock);
345 } 347 }
346 if (TestSetPageLRU(page)) 348 BUG_ON(PageLRU(page));
347 BUG(); 349 SetPageLRU(page);
348 add_page_to_inactive_list(zone, page); 350 add_page_to_inactive_list(zone, page);
349 } 351 }
350 if (zone) 352 if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
370 zone = pagezone; 372 zone = pagezone;
371 spin_lock_irq(&zone->lru_lock); 373 spin_lock_irq(&zone->lru_lock);
372 } 374 }
373 if (TestSetPageLRU(page)) 375 BUG_ON(PageLRU(page));
374 BUG(); 376 SetPageLRU(page);
375 if (TestSetPageActive(page)) 377 BUG_ON(PageActive(page));
376 BUG(); 378 SetPageActive(page);
377 add_page_to_active_list(zone, page); 379 add_page_to_active_list(zone, page);
378 } 380 }
379 if (zone) 381 if (zone)
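Both __page_cache_release() and release_pages() now follow the same sequence: test PageLRU() first, then clear the flag under zone->lru_lock with the non-atomic __ClearPageLRU(), asserting with BUG_ON() that it was still set, instead of the old atomic TestClearPageLRU(). Distilled from the __page_cache_release() hunk above (release_pages() does the same under its batched zone lock):

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		BUG_ON(!PageLRU(page));	/* the lru_lock holder owns the flag */
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}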
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/migrate.h>
18 19
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20 21
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073b8..365ed6ff182d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
116 last_in_cluster = offset + SWAPFILE_CLUSTER; 116 last_in_cluster = offset + SWAPFILE_CLUSTER;
117 else if (offset == last_in_cluster) { 117 else if (offset == last_in_cluster) {
118 spin_lock(&swap_lock); 118 spin_lock(&swap_lock);
119 si->cluster_next = offset-SWAPFILE_CLUSTER-1; 119 si->cluster_next = offset-SWAPFILE_CLUSTER+1;
120 goto cluster; 120 goto cluster;
121 } 121 }
122 if (unlikely(--latency_ration < 0)) { 122 if (unlikely(--latency_ration < 0)) {
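The one-character fix above is easiest to check with numbers (example values only, taking SWAPFILE_CLUSTER as 256). A used slot at offset N sets last_in_cluster = N + SWAPFILE_CLUSTER, so reaching offset == last_in_cluster means slots N+1 through N+256 are free and the cluster should restart at the first of them:

	/*
	 * SWAPFILE_CLUSTER = 256, last used slot at offset 1000:
	 *   last_in_cluster = 1000 + 256 = 1256
	 *   free run        = slots 1001 .. 1256
	 *
	 *   offset - SWAPFILE_CLUSTER + 1 = 1256 - 256 + 1 = 1001  (first free)
	 *   offset - SWAPFILE_CLUSTER - 1 = 1256 - 256 - 1 =  999  (still in use)
	 */
	si->cluster_next = offset - SWAPFILE_CLUSTER + 1;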
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e2..fd572bbdc9f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/div64.h> 39#include <asm/div64.h>
39 40
40#include <linux/swapops.h> 41#include <linux/swapops.h>
41 42
42/* possible outcome of pageout() */ 43#include "internal.h"
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53 44
54struct scan_control { 45struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
60 48
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
65 50
66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
67 unsigned int priority;
68
69 /* This context's GFP mask */ 51 /* This context's GFP mask */
70 gfp_t gfp_mask; 52 gfp_t gfp_mask;
71 53
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
183 * 165 *
184 * Returns the number of slab objects which we shrunk. 166 * Returns the number of slab objects which we shrunk.
185 */ 167 */
186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) 168unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
169 unsigned long lru_pages)
187{ 170{
188 struct shrinker *shrinker; 171 struct shrinker *shrinker;
189 int ret = 0; 172 unsigned long ret = 0;
190 173
191 if (scanned == 0) 174 if (scanned == 0)
192 scanned = SWAP_CLUSTER_MAX; 175 scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
306} 289}
307 290
308/* 291/*
309 * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 292 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage().
310 */ 294 */
311static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
312{ 296{
313 /* 297 /*
314 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 360 return PAGE_CLEAN;
377} 361}
378 362
379static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
380{ 364{
381 if (!mapping) 365 if (!mapping)
382 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
414} 398}
415 399
416/* 400/*
417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 401 * shrink_page_list() returns the number of reclaimed pages
418 */ 402 */
419static int shrink_list(struct list_head *page_list, struct scan_control *sc) 403static unsigned long shrink_page_list(struct list_head *page_list,
404 struct scan_control *sc)
420{ 405{
421 LIST_HEAD(ret_pages); 406 LIST_HEAD(ret_pages);
422 struct pagevec freed_pvec; 407 struct pagevec freed_pvec;
423 int pgactivate = 0; 408 int pgactivate = 0;
424 int reclaimed = 0; 409 unsigned long nr_reclaimed = 0;
425 410
426 cond_resched(); 411 cond_resched();
427 412
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
464 * Anonymous process memory has backing store? 449 * Anonymous process memory has backing store?
465 * Try to allocate it some swap space here. 450 * Try to allocate it some swap space here.
466 */ 451 */
467 if (PageAnon(page) && !PageSwapCache(page)) { 452 if (PageAnon(page) && !PageSwapCache(page))
468 if (!sc->may_swap)
469 goto keep_locked;
470 if (!add_to_swap(page, GFP_ATOMIC)) 453 if (!add_to_swap(page, GFP_ATOMIC))
471 goto activate_locked; 454 goto activate_locked;
472 }
473#endif /* CONFIG_SWAP */ 455#endif /* CONFIG_SWAP */
474 456
475 mapping = page_mapping(page); 457 mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
481 * processes. Try to unmap it here. 463 * processes. Try to unmap it here.
482 */ 464 */
483 if (page_mapped(page) && mapping) { 465 if (page_mapped(page) && mapping) {
484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) { 466 switch (try_to_unmap(page, 0)) {
491 case SWAP_FAIL: 467 case SWAP_FAIL:
492 goto activate_locked; 468 goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
561 537
562free_it: 538free_it:
563 unlock_page(page); 539 unlock_page(page);
564 reclaimed++; 540 nr_reclaimed++;
565 if (!pagevec_add(&freed_pvec, page)) 541 if (!pagevec_add(&freed_pvec, page))
566 __pagevec_release_nonlru(&freed_pvec); 542 __pagevec_release_nonlru(&freed_pvec);
567 continue; 543 continue;
@@ -579,483 +555,8 @@ keep:
579 if (pagevec_count(&freed_pvec)) 555 if (pagevec_count(&freed_pvec))
580 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
581 mod_page_state(pgactivate, pgactivate); 557 mod_page_state(pgactivate, pgactivate);
582 sc->nr_reclaimed += reclaimed; 558 return nr_reclaimed;
583 return reclaimed;
584}
585
586#ifdef CONFIG_MIGRATION
587static inline void move_to_lru(struct page *page)
588{
589 list_del(&page->lru);
590 if (PageActive(page)) {
591 /*
592 * lru_cache_add_active checks that
593 * the PG_active bit is off.
594 */
595 ClearPageActive(page);
596 lru_cache_add_active(page);
597 } else {
598 lru_cache_add(page);
599 }
600 put_page(page);
601}
602
603/*
604 * Add isolated pages on the list back to the LRU.
605 *
606 * returns the number of pages put back.
607 */
608int putback_lru_pages(struct list_head *l)
609{
610 struct page *page;
611 struct page *page2;
612 int count = 0;
613
614 list_for_each_entry_safe(page, page2, l, lru) {
615 move_to_lru(page);
616 count++;
617 }
618 return count;
619}
620
621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
631 * swapout a single page
632 * page is locked upon entry, unlocked on exit
633 */
634static int swap_page(struct page *page)
635{
636 struct address_space *mapping = page_mapping(page);
637
638 if (page_mapped(page) && mapping)
639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
640 goto unlock_retry;
641
642 if (PageDirty(page)) {
643 /* Page is dirty, try to write it out here */
644 switch(pageout(page, mapping)) {
645 case PAGE_KEEP:
646 case PAGE_ACTIVATE:
647 goto unlock_retry;
648
649 case PAGE_SUCCESS:
650 goto retry;
651
652 case PAGE_CLEAN:
653 ; /* try to free the page below */
654 }
655 }
656
657 if (PagePrivate(page)) {
658 if (!try_to_release_page(page, GFP_KERNEL) ||
659 (!mapping && page_count(page) == 1))
660 goto unlock_retry;
661 }
662
663 if (remove_mapping(mapping, page)) {
664 /* Success */
665 unlock_page(page);
666 return 0;
667 }
668
669unlock_retry:
670 unlock_page(page);
671
672retry:
673 return -EAGAIN;
674}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return -EAGAIN;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
711 * processses from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 if (try_to_unmap(page, 1) == SWAP_FAIL)
725 /* A vma has VM_LOCKED set -> Permanent failure */
726 return -EPERM;
727
728 /*
729 * Give up if we were unable to remove all mappings.
730 */
731 if (page_mapcount(page))
732 return -EAGAIN;
733
734 write_lock_irq(&mapping->tree_lock);
735
736 radix_pointer = (struct page **)radix_tree_lookup_slot(
737 &mapping->page_tree,
738 page_index(page));
739
740 if (!page_mapping(page) || page_count(page) != nr_refs ||
741 *radix_pointer != page) {
742 write_unlock_irq(&mapping->tree_lock);
743 return -EAGAIN;
744 }
745
746 /*
747 * Now we know that no one else is looking at the page.
748 *
749 * Certain minimal information about a page must be available
750 * in order for other subsystems to properly handle the page if they
751 * find it through the radix tree update before we are finished
752 * copying the page.
753 */
754 get_page(newpage);
755 newpage->index = page->index;
756 newpage->mapping = page->mapping;
757 if (PageSwapCache(page)) {
758 SetPageSwapCache(newpage);
759 set_page_private(newpage, page_private(page));
760 }
761
762 *radix_pointer = newpage;
763 __put_page(page);
764 write_unlock_irq(&mapping->tree_lock);
765
766 return 0;
767}
768EXPORT_SYMBOL(migrate_page_remove_references);
769
770/*
771 * Copy the page to its new location
772 */
773void migrate_page_copy(struct page *newpage, struct page *page)
774{
775 copy_highpage(newpage, page);
776
777 if (PageError(page))
778 SetPageError(newpage);
779 if (PageReferenced(page))
780 SetPageReferenced(newpage);
781 if (PageUptodate(page))
782 SetPageUptodate(newpage);
783 if (PageActive(page))
784 SetPageActive(newpage);
785 if (PageChecked(page))
786 SetPageChecked(newpage);
787 if (PageMappedToDisk(page))
788 SetPageMappedToDisk(newpage);
789
790 if (PageDirty(page)) {
791 clear_page_dirty_for_io(page);
792 set_page_dirty(newpage);
793 }
794
795 ClearPageSwapCache(page);
796 ClearPageActive(page);
797 ClearPagePrivate(page);
798 set_page_private(page, 0);
799 page->mapping = NULL;
800
801 /*
802 * If any waiters have accumulated on the new page then
803 * wake them up.
804 */
805 if (PageWriteback(newpage))
806 end_page_writeback(newpage);
807}
808EXPORT_SYMBOL(migrate_page_copy);
809
810/*
811 * Common logic to directly migrate a single page suitable for
812 * pages that do not use PagePrivate.
813 *
814 * Pages are locked upon entry and exit.
815 */
816int migrate_page(struct page *newpage, struct page *page)
817{
818 int rc;
819
820 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
821
822 rc = migrate_page_remove_references(newpage, page, 2);
823
824 if (rc)
825 return rc;
826
827 migrate_page_copy(newpage, page);
828
829 /*
830 * Remove auxiliary swap entries and replace
831 * them with real ptes.
832 *
833 * Note that a real pte entry will allow processes that are not
834 * waiting on the page lock to use the new page via the page tables
835 * before the new page is unlocked.
836 */
837 remove_from_swap(newpage);
838 return 0;
839} 559}
840EXPORT_SYMBOL(migrate_page);
841
842/*
843 * migrate_pages
844 *
845 * Two lists are passed to this function. The first list
846 * contains the pages isolated from the LRU to be migrated.
847 * The second list contains new pages that the pages isolated
848 * can be moved to. If the second list is NULL then all
849 * pages are swapped out.
850 *
851 * The function returns after 10 attempts or if no pages
852 * are movable anymore because to has become empty
853 * or no retryable pages exist anymore.
854 *
855 * Return: Number of pages not migrated when "to" ran empty.
856 */
857int migrate_pages(struct list_head *from, struct list_head *to,
858 struct list_head *moved, struct list_head *failed)
859{
860 int retry;
861 int nr_failed = 0;
862 int pass = 0;
863 struct page *page;
864 struct page *page2;
865 int swapwrite = current->flags & PF_SWAPWRITE;
866 int rc;
867
868 if (!swapwrite)
869 current->flags |= PF_SWAPWRITE;
870
871redo:
872 retry = 0;
873
874 list_for_each_entry_safe(page, page2, from, lru) {
875 struct page *newpage = NULL;
876 struct address_space *mapping;
877
878 cond_resched();
879
880 rc = 0;
881 if (page_count(page) == 1)
882 /* page was freed from under us. So we are done. */
883 goto next;
884
885 if (to && list_empty(to))
886 break;
887
888 /*
889 * Skip locked pages during the first two passes to give the
890 * functions holding the lock time to release the page. Later we
891 * use lock_page() to have a higher chance of acquiring the
892 * lock.
893 */
894 rc = -EAGAIN;
895 if (pass > 2)
896 lock_page(page);
897 else
898 if (TestSetPageLocked(page))
899 goto next;
900
901 /*
902 * Only wait on writeback if we have already done a pass where
903 * we we may have triggered writeouts for lots of pages.
904 */
905 if (pass > 0) {
906 wait_on_page_writeback(page);
907 } else {
908 if (PageWriteback(page))
909 goto unlock_page;
910 }
911
912 /*
913 * Anonymous pages must have swap cache references otherwise
914 * the information contained in the page maps cannot be
915 * preserved.
916 */
917 if (PageAnon(page) && !PageSwapCache(page)) {
918 if (!add_to_swap(page, GFP_KERNEL)) {
919 rc = -ENOMEM;
920 goto unlock_page;
921 }
922 }
923
924 if (!to) {
925 rc = swap_page(page);
926 goto next;
927 }
928
929 newpage = lru_to_page(to);
930 lock_page(newpage);
931
932 /*
933 * Pages are properly locked and writeback is complete.
934 * Try to migrate the page.
935 */
936 mapping = page_mapping(page);
937 if (!mapping)
938 goto unlock_both;
939
940 if (mapping->a_ops->migratepage) {
941 /*
942 * Most pages have a mapping and most filesystems
943 * should provide a migration function. Anonymous
944 * pages are part of swap space which also has its
945 * own migration function. This is the most common
946 * path for page migration.
947 */
948 rc = mapping->a_ops->migratepage(newpage, page);
949 goto unlock_both;
950 }
951
952 /*
953 * Default handling if a filesystem does not provide
954 * a migration function. We can only migrate clean
955 * pages so try to write out any dirty pages first.
956 */
957 if (PageDirty(page)) {
958 switch (pageout(page, mapping)) {
959 case PAGE_KEEP:
960 case PAGE_ACTIVATE:
961 goto unlock_both;
962
963 case PAGE_SUCCESS:
964 unlock_page(newpage);
965 goto next;
966
967 case PAGE_CLEAN:
968 ; /* try to migrate the page below */
969 }
970 }
971
972 /*
973 * Buffers are managed in a filesystem specific way.
974 * We must have no buffers or drop them.
975 */
976 if (!page_has_buffers(page) ||
977 try_to_release_page(page, GFP_KERNEL)) {
978 rc = migrate_page(newpage, page);
979 goto unlock_both;
980 }
981
982 /*
983 * On early passes with mapped pages simply
984 * retry. There may be a lock held for some
985 * buffers that may go away. Later
986 * swap them out.
987 */
988 if (pass > 4) {
989 /*
990 * Persistently unable to drop buffers..... As a
991 * measure of last resort we fall back to
992 * swap_page().
993 */
994 unlock_page(newpage);
995 newpage = NULL;
996 rc = swap_page(page);
997 goto next;
998 }
999
1000unlock_both:
1001 unlock_page(newpage);
1002
1003unlock_page:
1004 unlock_page(page);
1005
1006next:
1007 if (rc == -EAGAIN) {
1008 retry++;
1009 } else if (rc) {
1010 /* Permanent failure */
1011 list_move(&page->lru, failed);
1012 nr_failed++;
1013 } else {
1014 if (newpage) {
1015 /* Successful migration. Return page to LRU */
1016 move_to_lru(newpage);
1017 }
1018 list_move(&page->lru, moved);
1019 }
1020 }
1021 if (retry && pass++ < 10)
1022 goto redo;
1023
1024 if (!swapwrite)
1025 current->flags &= ~PF_SWAPWRITE;
1026
1027 return nr_failed + retry;
1028}
1029
1030/*
1031 * Isolate one page from the LRU lists and put it on the
1032 * indicated list with elevated refcount.
1033 *
1034 * Result:
1035 * 0 = page not on LRU list
1036 * 1 = page removed from LRU list and added to the specified list.
1037 */
1038int isolate_lru_page(struct page *page)
1039{
1040 int ret = 0;
1041
1042 if (PageLRU(page)) {
1043 struct zone *zone = page_zone(page);
1044 spin_lock_irq(&zone->lru_lock);
1045 if (TestClearPageLRU(page)) {
1046 ret = 1;
1047 get_page(page);
1048 if (PageActive(page))
1049 del_page_from_active_list(zone, page);
1050 else
1051 del_page_from_inactive_list(zone, page);
1052 }
1053 spin_unlock_irq(&zone->lru_lock);
1054 }
1055
1056 return ret;
1057}
1058#endif
1059 560
1060/* 561/*
1061 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
1074 * 575 *
1075 * returns how many pages were moved onto *@dst. 576 * returns how many pages were moved onto *@dst.
1076 */ 577 */
1077static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 578static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1078 struct list_head *dst, int *scanned) 579 struct list_head *src, struct list_head *dst,
580 unsigned long *scanned)
1079{ 581{
1080 int nr_taken = 0; 582 unsigned long nr_taken = 0;
1081 struct page *page; 583 struct page *page;
1082 int scan = 0; 584 unsigned long scan;
1083 585
1084 while (scan++ < nr_to_scan && !list_empty(src)) { 586 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
587 struct list_head *target;
1085 page = lru_to_page(src); 588 page = lru_to_page(src);
1086 prefetchw_prev_lru_page(page, src, flags); 589 prefetchw_prev_lru_page(page, src, flags);
1087 590
1088 if (!TestClearPageLRU(page)) 591 BUG_ON(!PageLRU(page));
1089 BUG(); 592
1090 list_del(&page->lru); 593 list_del(&page->lru);
1091 if (get_page_testone(page)) { 594 target = src;
595 if (likely(get_page_unless_zero(page))) {
1092 /* 596 /*
1093 * It is being freed elsewhere 597 * Be careful not to clear PageLRU until after we're
598 * sure the page is not being freed elsewhere -- the
599 * page release code relies on it.
1094 */ 600 */
1095 __put_page(page); 601 ClearPageLRU(page);
1096 SetPageLRU(page); 602 target = dst;
1097 list_add(&page->lru, src);
1098 continue;
1099 } else {
1100 list_add(&page->lru, dst);
1101 nr_taken++; 603 nr_taken++;
1102 } 604 } /* else it is being freed elsewhere */
605
606 list_add(&page->lru, target);
1103 } 607 }
1104 608
1105 *scanned = scan; 609 *scanned = scan;
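
The rewritten isolate_lru_pages() above leans on get_page_unless_zero(): a reference is taken only if the page's count is not already zero, and PageLRU is cleared only after that succeeds, because the page release code relies on seeing the flag while it tears the page down. Below is a userspace sketch of the "get unless zero" idiom using C11 atomics; struct fake_page and its refcount field are invented for the example and are not the kernel's implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        atomic_int refcount;
};

static bool get_page_unless_zero(struct fake_page *page)
{
        int old = atomic_load(&page->refcount);

        while (old != 0) {
                /* try to move old -> old + 1; retry if someone raced us */
                if (atomic_compare_exchange_weak(&page->refcount, &old, old + 1))
                        return true;
        }
        return false;           /* it is being freed elsewhere */
}

int main(void)
{
        struct fake_page live  = { .refcount = 2 };
        struct fake_page dying = { .refcount = 0 };

        printf("live:  %s\n", get_page_unless_zero(&live)  ? "isolated" : "skipped");
        printf("dying: %s\n", get_page_unless_zero(&dying) ? "isolated" : "skipped");
        return 0;
}

In the scanner this is what lets a page whose last reference is already on its way to the free lists stay on the source list instead of being resurrected.
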
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1107} 611}
1108 612
1109/* 613/*
1110 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 614 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
615 * of reclaimed pages
1111 */ 616 */
1112static void shrink_cache(struct zone *zone, struct scan_control *sc) 617static unsigned long shrink_inactive_list(unsigned long max_scan,
618 struct zone *zone, struct scan_control *sc)
1113{ 619{
1114 LIST_HEAD(page_list); 620 LIST_HEAD(page_list);
1115 struct pagevec pvec; 621 struct pagevec pvec;
1116 int max_scan = sc->nr_to_scan; 622 unsigned long nr_scanned = 0;
623 unsigned long nr_reclaimed = 0;
1117 624
1118 pagevec_init(&pvec, 1); 625 pagevec_init(&pvec, 1);
1119 626
1120 lru_add_drain(); 627 lru_add_drain();
1121 spin_lock_irq(&zone->lru_lock); 628 spin_lock_irq(&zone->lru_lock);
1122 while (max_scan > 0) { 629 do {
1123 struct page *page; 630 struct page *page;
1124 int nr_taken; 631 unsigned long nr_taken;
1125 int nr_scan; 632 unsigned long nr_scan;
1126 int nr_freed; 633 unsigned long nr_freed;
1127 634
1128 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 635 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1129 &zone->inactive_list, 636 &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1132 zone->pages_scanned += nr_scan; 639 zone->pages_scanned += nr_scan;
1133 spin_unlock_irq(&zone->lru_lock); 640 spin_unlock_irq(&zone->lru_lock);
1134 641
1135 if (nr_taken == 0) 642 nr_scanned += nr_scan;
1136 goto done; 643 nr_freed = shrink_page_list(&page_list, sc);
1137 644 nr_reclaimed += nr_freed;
1138 max_scan -= nr_scan;
1139 nr_freed = shrink_list(&page_list, sc);
1140
1141 local_irq_disable(); 645 local_irq_disable();
1142 if (current_is_kswapd()) { 646 if (current_is_kswapd()) {
1143 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1146 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 650 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
1147 __mod_page_state_zone(zone, pgsteal, nr_freed); 651 __mod_page_state_zone(zone, pgsteal, nr_freed);
1148 652
653 if (nr_taken == 0)
654 goto done;
655
1149 spin_lock(&zone->lru_lock); 656 spin_lock(&zone->lru_lock);
1150 /* 657 /*
1151 * Put back any unfreeable pages. 658 * Put back any unfreeable pages.
1152 */ 659 */
1153 while (!list_empty(&page_list)) { 660 while (!list_empty(&page_list)) {
1154 page = lru_to_page(&page_list); 661 page = lru_to_page(&page_list);
1155 if (TestSetPageLRU(page)) 662 BUG_ON(PageLRU(page));
1156 BUG(); 663 SetPageLRU(page);
1157 list_del(&page->lru); 664 list_del(&page->lru);
1158 if (PageActive(page)) 665 if (PageActive(page))
1159 add_page_to_active_list(zone, page); 666 add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1165 spin_lock_irq(&zone->lru_lock); 672 spin_lock_irq(&zone->lru_lock);
1166 } 673 }
1167 } 674 }
1168 } 675 } while (nr_scanned < max_scan);
1169 spin_unlock_irq(&zone->lru_lock); 676 spin_unlock(&zone->lru_lock);
1170done: 677done:
678 local_irq_enable();
1171 pagevec_release(&pvec); 679 pagevec_release(&pvec);
680 return nr_reclaimed;
1172} 681}
1173 682
1174/* 683/*
@@ -1188,13 +697,12 @@ done:
1188 * The downside is that we have to touch page->_count against each page. 697 * The downside is that we have to touch page->_count against each page.
1189 * But we had to alter page->flags anyway. 698 * But we had to alter page->flags anyway.
1190 */ 699 */
1191static void 700static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1192refill_inactive_zone(struct zone *zone, struct scan_control *sc) 701 struct scan_control *sc)
1193{ 702{
1194 int pgmoved; 703 unsigned long pgmoved;
1195 int pgdeactivate = 0; 704 int pgdeactivate = 0;
1196 int pgscanned; 705 unsigned long pgscanned;
1197 int nr_pages = sc->nr_to_scan;
1198 LIST_HEAD(l_hold); /* The pages which were snipped off */ 706 LIST_HEAD(l_hold); /* The pages which were snipped off */
1199 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 707 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1200 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 708 LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1202 struct pagevec pvec; 710 struct pagevec pvec;
1203 int reclaim_mapped = 0; 711 int reclaim_mapped = 0;
1204 712
1205 if (unlikely(sc->may_swap)) { 713 if (sc->may_swap) {
1206 long mapped_ratio; 714 long mapped_ratio;
1207 long distress; 715 long distress;
1208 long swap_tendency; 716 long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1272 while (!list_empty(&l_inactive)) { 780 while (!list_empty(&l_inactive)) {
1273 page = lru_to_page(&l_inactive); 781 page = lru_to_page(&l_inactive);
1274 prefetchw_prev_lru_page(page, &l_inactive, flags); 782 prefetchw_prev_lru_page(page, &l_inactive, flags);
1275 if (TestSetPageLRU(page)) 783 BUG_ON(PageLRU(page));
1276 BUG(); 784 SetPageLRU(page);
1277 if (!TestClearPageActive(page)) 785 BUG_ON(!PageActive(page));
1278 BUG(); 786 ClearPageActive(page);
787
1279 list_move(&page->lru, &zone->inactive_list); 788 list_move(&page->lru, &zone->inactive_list);
1280 pgmoved++; 789 pgmoved++;
1281 if (!pagevec_add(&pvec, page)) { 790 if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1301 while (!list_empty(&l_active)) { 810 while (!list_empty(&l_active)) {
1302 page = lru_to_page(&l_active); 811 page = lru_to_page(&l_active);
1303 prefetchw_prev_lru_page(page, &l_active, flags); 812 prefetchw_prev_lru_page(page, &l_active, flags);
1304 if (TestSetPageLRU(page)) 813 BUG_ON(PageLRU(page));
1305 BUG(); 814 SetPageLRU(page);
1306 BUG_ON(!PageActive(page)); 815 BUG_ON(!PageActive(page));
1307 list_move(&page->lru, &zone->active_list); 816 list_move(&page->lru, &zone->active_list);
1308 pgmoved++; 817 pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1327/* 836/*
1328 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 837 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1329 */ 838 */
1330static void 839static unsigned long shrink_zone(int priority, struct zone *zone,
1331shrink_zone(struct zone *zone, struct scan_control *sc) 840 struct scan_control *sc)
1332{ 841{
1333 unsigned long nr_active; 842 unsigned long nr_active;
1334 unsigned long nr_inactive; 843 unsigned long nr_inactive;
844 unsigned long nr_to_scan;
845 unsigned long nr_reclaimed = 0;
1335 846
1336 atomic_inc(&zone->reclaim_in_progress); 847 atomic_inc(&zone->reclaim_in_progress);
1337 848
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1339 * Add one to `nr_to_scan' just to make sure that the kernel will 850 * Add one to `nr_to_scan' just to make sure that the kernel will
1340 * slowly sift through the active list. 851 * slowly sift through the active list.
1341 */ 852 */
1342 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 853 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1343 nr_active = zone->nr_scan_active; 854 nr_active = zone->nr_scan_active;
1344 if (nr_active >= sc->swap_cluster_max) 855 if (nr_active >= sc->swap_cluster_max)
1345 zone->nr_scan_active = 0; 856 zone->nr_scan_active = 0;
1346 else 857 else
1347 nr_active = 0; 858 nr_active = 0;
1348 859
1349 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 860 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1350 nr_inactive = zone->nr_scan_inactive; 861 nr_inactive = zone->nr_scan_inactive;
1351 if (nr_inactive >= sc->swap_cluster_max) 862 if (nr_inactive >= sc->swap_cluster_max)
1352 zone->nr_scan_inactive = 0; 863 zone->nr_scan_inactive = 0;
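
The arithmetic above batches scanning: each call adds (list size >> priority) + 1 to a deferred counter, and a real scan only happens once that counter reaches the batch size; otherwise the remainder carries over to the next pass. A toy run with invented numbers (40,000 active pages, priority 12, batch size 32) shows how the "+1" slowly sifts through even a small active list:

#include <stdio.h>

int main(void)
{
        unsigned long nr_active = 40000;
        unsigned long nr_scan_active = 0;       /* zone->nr_scan_active */
        unsigned long swap_cluster_max = 32;
        int priority = 12;
        int pass;

        for (pass = 1; pass <= 5; pass++) {
                nr_scan_active += (nr_active >> priority) + 1;
                if (nr_scan_active >= swap_cluster_max) {
                        printf("pass %d: scan %lu active pages\n",
                               pass, nr_scan_active);
                        nr_scan_active = 0;
                } else {
                        printf("pass %d: defer (%lu accumulated)\n",
                               pass, nr_scan_active);
                }
        }
        return 0;
}
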
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1355 866
1356 while (nr_active || nr_inactive) { 867 while (nr_active || nr_inactive) {
1357 if (nr_active) { 868 if (nr_active) {
1358 sc->nr_to_scan = min(nr_active, 869 nr_to_scan = min(nr_active,
1359 (unsigned long)sc->swap_cluster_max); 870 (unsigned long)sc->swap_cluster_max);
1360 nr_active -= sc->nr_to_scan; 871 nr_active -= nr_to_scan;
1361 refill_inactive_zone(zone, sc); 872 shrink_active_list(nr_to_scan, zone, sc);
1362 } 873 }
1363 874
1364 if (nr_inactive) { 875 if (nr_inactive) {
1365 sc->nr_to_scan = min(nr_inactive, 876 nr_to_scan = min(nr_inactive,
1366 (unsigned long)sc->swap_cluster_max); 877 (unsigned long)sc->swap_cluster_max);
1367 nr_inactive -= sc->nr_to_scan; 878 nr_inactive -= nr_to_scan;
1368 shrink_cache(zone, sc); 879 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
880 sc);
1369 } 881 }
1370 } 882 }
1371 883
1372 throttle_vm_writeout(); 884 throttle_vm_writeout();
1373 885
1374 atomic_dec(&zone->reclaim_in_progress); 886 atomic_dec(&zone->reclaim_in_progress);
887 return nr_reclaimed;
1375} 888}
1376 889
1377/* 890/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1390 * If a zone is deemed to be full of pinned pages then just give it a light 903 * If a zone is deemed to be full of pinned pages then just give it a light
1391 * scan then give up on it. 904 * scan then give up on it.
1392 */ 905 */
1393static void 906static unsigned long shrink_zones(int priority, struct zone **zones,
1394shrink_caches(struct zone **zones, struct scan_control *sc) 907 struct scan_control *sc)
1395{ 908{
909 unsigned long nr_reclaimed = 0;
1396 int i; 910 int i;
1397 911
1398 for (i = 0; zones[i] != NULL; i++) { 912 for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1404 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 918 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1405 continue; 919 continue;
1406 920
1407 zone->temp_priority = sc->priority; 921 zone->temp_priority = priority;
1408 if (zone->prev_priority > sc->priority) 922 if (zone->prev_priority > priority)
1409 zone->prev_priority = sc->priority; 923 zone->prev_priority = priority;
1410 924
1411 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 925 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1412 continue; /* Let kswapd poll it */ 926 continue; /* Let kswapd poll it */
1413 927
1414 shrink_zone(zone, sc); 928 nr_reclaimed += shrink_zone(priority, zone, sc);
1415 } 929 }
930 return nr_reclaimed;
1416} 931}
1417 932
1418/* 933/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1428 * holds filesystem locks which prevent writeout this might not work, and the 943 * holds filesystem locks which prevent writeout this might not work, and the
1429 * allocation attempt will fail. 944 * allocation attempt will fail.
1430 */ 945 */
1431int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 946unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1432{ 947{
1433 int priority; 948 int priority;
1434 int ret = 0; 949 int ret = 0;
1435 int total_scanned = 0, total_reclaimed = 0; 950 unsigned long total_scanned = 0;
951 unsigned long nr_reclaimed = 0;
1436 struct reclaim_state *reclaim_state = current->reclaim_state; 952 struct reclaim_state *reclaim_state = current->reclaim_state;
1437 struct scan_control sc;
1438 unsigned long lru_pages = 0; 953 unsigned long lru_pages = 0;
1439 int i; 954 int i;
1440 955 struct scan_control sc = {
1441 sc.gfp_mask = gfp_mask; 956 .gfp_mask = gfp_mask,
1442 sc.may_writepage = !laptop_mode; 957 .may_writepage = !laptop_mode,
1443 sc.may_swap = 1; 958 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1,
960 };
1444 961
1445 inc_page_state(allocstall); 962 inc_page_state(allocstall);
1446 963
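
try_to_free_pages() now builds its scan_control with a C99 designated initializer instead of field-by-field assignments. One property the new style relies on is that members left out of the initializer are zeroed, which is part of why the per-iteration bookkeeping that used to live in sc.nr_reclaimed and sc.priority can move into ordinary locals. A stand-alone illustration with a trimmed-down copy of the struct (the GFP value is a made-up placeholder):

#include <stdio.h>

struct scan_control {
        unsigned int gfp_mask;
        int may_writepage;
        int may_swap;
        unsigned long swap_cluster_max;
        unsigned long nr_scanned;
};

int main(void)
{
        struct scan_control sc = {
                .gfp_mask         = 0x20,       /* placeholder for GFP_KERNEL */
                .may_writepage    = 1,
                .swap_cluster_max = 32,         /* SWAP_CLUSTER_MAX           */
                .may_swap         = 1,
        };

        printf("nr_scanned starts at %lu\n", sc.nr_scanned);    /* 0 */
        return 0;
}
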
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1457 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 974 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1458 sc.nr_mapped = read_page_state(nr_mapped); 975 sc.nr_mapped = read_page_state(nr_mapped);
1459 sc.nr_scanned = 0; 976 sc.nr_scanned = 0;
1460 sc.nr_reclaimed = 0;
1461 sc.priority = priority;
1462 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1463 if (!priority) 977 if (!priority)
1464 disable_swap_token(); 978 disable_swap_token();
1465 shrink_caches(zones, &sc); 979 nr_reclaimed += shrink_zones(priority, zones, &sc);
1466 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 980 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1467 if (reclaim_state) { 981 if (reclaim_state) {
1468 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 982 nr_reclaimed += reclaim_state->reclaimed_slab;
1469 reclaim_state->reclaimed_slab = 0; 983 reclaim_state->reclaimed_slab = 0;
1470 } 984 }
1471 total_scanned += sc.nr_scanned; 985 total_scanned += sc.nr_scanned;
1472 total_reclaimed += sc.nr_reclaimed; 986 if (nr_reclaimed >= sc.swap_cluster_max) {
1473 if (total_reclaimed >= sc.swap_cluster_max) {
1474 ret = 1; 987 ret = 1;
1475 goto out; 988 goto out;
1476 } 989 }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1482 * that's undesirable in laptop mode, where we *want* lumpy 995 * that's undesirable in laptop mode, where we *want* lumpy
1483 * writeout. So in laptop mode, write out the whole world. 996 * writeout. So in laptop mode, write out the whole world.
1484 */ 997 */
1485 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 998 if (total_scanned > sc.swap_cluster_max +
999 sc.swap_cluster_max / 2) {
1486 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1000 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1487 sc.may_writepage = 1; 1001 sc.may_writepage = 1;
1488 } 1002 }
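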
@@ -1528,22 +1042,26 @@ out:
1528 * the page allocator fallback scheme to ensure that aging of pages is balanced 1042 * the page allocator fallback scheme to ensure that aging of pages is balanced
1529 * across the zones. 1043 * across the zones.
1530 */ 1044 */
1531static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1046 int order)
1532{ 1047{
1533 int to_free = nr_pages; 1048 unsigned long to_free = nr_pages;
1534 int all_zones_ok; 1049 int all_zones_ok;
1535 int priority; 1050 int priority;
1536 int i; 1051 int i;
1537 int total_scanned, total_reclaimed; 1052 unsigned long total_scanned;
1053 unsigned long nr_reclaimed;
1538 struct reclaim_state *reclaim_state = current->reclaim_state; 1054 struct reclaim_state *reclaim_state = current->reclaim_state;
1539 struct scan_control sc; 1055 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
1059 };
1540 1060
1541loop_again: 1061loop_again:
1542 total_scanned = 0; 1062 total_scanned = 0;
1543 total_reclaimed = 0; 1063 nr_reclaimed = 0;
1544 sc.gfp_mask = GFP_KERNEL; 1064 sc.may_writepage = !laptop_mode,
1545 sc.may_writepage = !laptop_mode;
1546 sc.may_swap = 1;
1547 sc.nr_mapped = read_page_state(nr_mapped); 1065 sc.nr_mapped = read_page_state(nr_mapped);
1548 1066
1549 inc_page_state(pageoutrun); 1067 inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
1624 if (zone->prev_priority > priority) 1142 if (zone->prev_priority > priority)
1625 zone->prev_priority = priority; 1143 zone->prev_priority = priority;
1626 sc.nr_scanned = 0; 1144 sc.nr_scanned = 0;
1627 sc.nr_reclaimed = 0; 1145 nr_reclaimed += shrink_zone(priority, zone, &sc);
1628 sc.priority = priority;
1629 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1630 shrink_zone(zone, &sc);
1631 reclaim_state->reclaimed_slab = 0; 1146 reclaim_state->reclaimed_slab = 0;
1632 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1147 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1633 lru_pages); 1148 lru_pages);
1634 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1149 nr_reclaimed += reclaim_state->reclaimed_slab;
1635 total_reclaimed += sc.nr_reclaimed;
1636 total_scanned += sc.nr_scanned; 1150 total_scanned += sc.nr_scanned;
1637 if (zone->all_unreclaimable) 1151 if (zone->all_unreclaimable)
1638 continue; 1152 continue;
@@ -1645,10 +1159,10 @@ scan:
1645 * even in laptop mode 1159 * even in laptop mode
1646 */ 1160 */
1647 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1161 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1648 total_scanned > total_reclaimed+total_reclaimed/2) 1162 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1649 sc.may_writepage = 1; 1163 sc.may_writepage = 1;
1650 } 1164 }
1651 if (nr_pages && to_free > total_reclaimed) 1165 if (nr_pages && to_free > nr_reclaimed)
1652 continue; /* swsusp: need to do more work */ 1166 continue; /* swsusp: need to do more work */
1653 if (all_zones_ok) 1167 if (all_zones_ok)
1654 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
1665 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1666 * on zone->*_priority. 1180 * on zone->*_priority.
1667 */ 1181 */
1668 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
1669 break; 1183 break;
1670 } 1184 }
1671out: 1185out:
@@ -1679,7 +1193,7 @@ out:
1679 goto loop_again; 1193 goto loop_again;
1680 } 1194 }
1681 1195
1682 return total_reclaimed; 1196 return nr_reclaimed;
1683} 1197}
1684 1198
1685/* 1199/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
1779 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
1780 * pages. 1294 * pages.
1781 */ 1295 */
1782int shrink_all_memory(int nr_pages) 1296unsigned long shrink_all_memory(unsigned long nr_pages)
1783{ 1297{
1784 pg_data_t *pgdat; 1298 pg_data_t *pgdat;
1785 int nr_to_free = nr_pages; 1299 unsigned long nr_to_free = nr_pages;
1786 int ret = 0; 1300 unsigned long ret = 0;
1301 unsigned retry = 2;
1787 struct reclaim_state reclaim_state = { 1302 struct reclaim_state reclaim_state = {
1788 .reclaimed_slab = 0, 1303 .reclaimed_slab = 0,
1789 }; 1304 };
1790 1305
1791 current->reclaim_state = &reclaim_state; 1306 current->reclaim_state = &reclaim_state;
1307repeat:
1792 for_each_pgdat(pgdat) { 1308 for_each_pgdat(pgdat) {
1793 int freed; 1309 unsigned long freed;
1310
1794 freed = balance_pgdat(pgdat, nr_to_free, 0); 1311 freed = balance_pgdat(pgdat, nr_to_free, 0);
1795 ret += freed; 1312 ret += freed;
1796 nr_to_free -= freed; 1313 nr_to_free -= freed;
1797 if (nr_to_free <= 0) 1314 if ((long)nr_to_free <= 0)
1798 break; 1315 break;
1799 } 1316 }
1317 if (retry-- && ret < nr_pages) {
1318 blk_congestion_wait(WRITE, HZ/5);
1319 goto repeat;
1320 }
1800 current->reclaim_state = NULL; 1321 current->reclaim_state = NULL;
1801 return ret; 1322 return ret;
1802} 1323}
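
shrink_all_memory() now keeps its counters in unsigned long, which is why the exit test becomes (long)nr_to_free <= 0: when balance_pgdat() frees more than was asked for, the subtraction wraps to a huge positive value rather than going negative, and only the signed reinterpretation notices. (The added retry loop with blk_congestion_wait() simply gives writeback a chance to complete before the remaining target is attempted again.) Stand-alone illustration of the wraparound:

#include <stdio.h>

int main(void)
{
        unsigned long nr_to_free = 100;
        unsigned long freed = 128;              /* balance_pgdat overshot */

        nr_to_free -= freed;                    /* wraps to ULONG_MAX - 27 */

        printf("nr_to_free            = %lu\n", nr_to_free);
        printf("nr_to_free <= 0       ? %d\n", nr_to_free <= 0);         /* 0: never exits */
        printf("(long)nr_to_free <= 0 ? %d\n", (long)nr_to_free <= 0);   /* 1: loop exits  */
        return 0;
}
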
@@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages)
1808 away, we get changed to run anywhere: as the first one comes back, 1329 away, we get changed to run anywhere: as the first one comes back,
1809 restore their cpu bindings. */ 1330 restore their cpu bindings. */
1810static int __devinit cpu_callback(struct notifier_block *nfb, 1331static int __devinit cpu_callback(struct notifier_block *nfb,
1811 unsigned long action, 1332 unsigned long action, void *hcpu)
1812 void *hcpu)
1813{ 1333{
1814 pg_data_t *pgdat; 1334 pg_data_t *pgdat;
1815 cpumask_t mask; 1335 cpumask_t mask;
@@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1829static int __init kswapd_init(void) 1349static int __init kswapd_init(void)
1830{ 1350{
1831 pg_data_t *pgdat; 1351 pg_data_t *pgdat;
1352
1832 swap_setup(); 1353 swap_setup();
1833 for_each_pgdat(pgdat) 1354 for_each_pgdat(pgdat) {
1834 pgdat->kswapd 1355 pid_t pid;
1835 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 pgdat->kswapd = find_task_by_pid(pid);
1360 }
1836 total_memory = nr_free_pagecache_pages(); 1361 total_memory = nr_free_pagecache_pages();
1837 hotcpu_notifier(cpu_callback, 0); 1362 hotcpu_notifier(cpu_callback, 0);
1838 return 0; 1363 return 0;
@@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1874/* 1399/*
1875 * Try to free up some pages from this zone through reclaim. 1400 * Try to free up some pages from this zone through reclaim.
1876 */ 1401 */
1877int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1402static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1878{ 1403{
1879 int nr_pages; 1404 /* Minimum pages needed in order to stay on node */
1405 const unsigned long nr_pages = 1 << order;
1880 struct task_struct *p = current; 1406 struct task_struct *p = current;
1881 struct reclaim_state reclaim_state; 1407 struct reclaim_state reclaim_state;
1882 struct scan_control sc; 1408 int priority;
1883 cpumask_t mask; 1409 unsigned long nr_reclaimed = 0;
1884 int node_id; 1410 struct scan_control sc = {
1885 1411 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1886 if (time_before(jiffies, 1412 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1887 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1413 .nr_mapped = read_page_state(nr_mapped),
1888 return 0; 1414 .swap_cluster_max = max_t(unsigned long, nr_pages,
1889 1415 SWAP_CLUSTER_MAX),
1890 if (!(gfp_mask & __GFP_WAIT) || 1416 .gfp_mask = gfp_mask,
1891 zone->all_unreclaimable || 1417 };
1892 atomic_read(&zone->reclaim_in_progress) > 0 ||
1893 (p->flags & PF_MEMALLOC))
1894 return 0;
1895
1896 node_id = zone->zone_pgdat->node_id;
1897 mask = node_to_cpumask(node_id);
1898 if (!cpus_empty(mask) && node_id != numa_node_id())
1899 return 0;
1900
1901 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1902 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1903 sc.nr_scanned = 0;
1904 sc.nr_reclaimed = 0;
1905 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1906 sc.nr_mapped = read_page_state(nr_mapped);
1907 sc.gfp_mask = gfp_mask;
1908 1418
1909 disable_swap_token(); 1419 disable_swap_token();
1910
1911 nr_pages = 1 << order;
1912 if (nr_pages > SWAP_CLUSTER_MAX)
1913 sc.swap_cluster_max = nr_pages;
1914 else
1915 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1916
1917 cond_resched(); 1420 cond_resched();
1918 /* 1421 /*
1919 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1422 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1928 * Free memory by calling shrink zone with increasing priorities 1431 * Free memory by calling shrink zone with increasing priorities
1929 * until we have enough memory freed. 1432 * until we have enough memory freed.
1930 */ 1433 */
1434 priority = ZONE_RECLAIM_PRIORITY;
1931 do { 1435 do {
1932 sc.priority--; 1436 nr_reclaimed += shrink_zone(priority, zone, &sc);
1933 shrink_zone(zone, &sc); 1437 priority--;
1438 } while (priority >= 0 && nr_reclaimed < nr_pages);
1934 1439
1935 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); 1440 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1936
1937 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1938 /* 1441 /*
1939 * shrink_slab does not currently allow us to determine 1442 * shrink_slab() does not currently allow us to determine how
1940 * how many pages were freed in the zone. So we just 1443 * many pages were freed in this zone. So we just shake the slab
1941 * shake the slab and then go offnode for a single allocation. 1444 * a bit and then go off node for this particular allocation
1445 * despite possibly having freed enough memory to allocate in
1446 * this zone. If we freed local memory then the next
1447 * allocations will be local again.
1942 * 1448 *
1943 * shrink_slab will free memory on all zones and may take 1449 * shrink_slab will free memory on all zones and may take
1944 * a long time. 1450 * a long time.
@@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1949 p->reclaim_state = NULL; 1455 p->reclaim_state = NULL;
1950 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1456 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1951 1457
1952 if (sc.nr_reclaimed == 0) 1458 if (nr_reclaimed == 0) {
1459 /*
1460 * We were unable to reclaim enough pages to stay on node. We
1461 * now allow off node accesses for a certain time period before
1462 * trying again to reclaim pages from the local zone.
1463 */
1953 zone->last_unsuccessful_zone_reclaim = jiffies; 1464 zone->last_unsuccessful_zone_reclaim = jiffies;
1465 }
1954 1466
1955 return sc.nr_reclaimed >= nr_pages; 1467 return nr_reclaimed >= nr_pages;
1956} 1468}
1957#endif
1958 1469
1470int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1471{
1472 cpumask_t mask;
1473 int node_id;
1474
1475 /*
1476 * Do not reclaim if there was a recent unsuccessful attempt at zone
1477 * reclaim. In that case we let allocations go off node for the
1478 * zone_reclaim_interval. Otherwise we would scan for each off-node
1479 * page allocation.
1480 */
1481 if (time_before(jiffies,
1482 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1483 return 0;
1484
1485 /*
1486 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1487 * not have reclaimable pages and if we should not delay the allocation
1488 * then do not scan.
1489 */
1490 if (!(gfp_mask & __GFP_WAIT) ||
1491 zone->all_unreclaimable ||
1492 atomic_read(&zone->reclaim_in_progress) > 0 ||
1493 (current->flags & PF_MEMALLOC))
1494 return 0;
1495
1496 /*
1497 * Only run zone reclaim on the local zone or on zones that do not
1498 * have associated processors. This will favor the local processor
1499 * over remote processors and spread off node memory allocations
1500 * as wide as possible.
1501 */
1502 node_id = zone->zone_pgdat->node_id;
1503 mask = node_to_cpumask(node_id);
1504 if (!cpus_empty(mask) && node_id != numa_node_id())
1505 return 0;
1506 return __zone_reclaim(zone, gfp_mask, order);
1507}
1508#endif
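
The new zone_reclaim() wrapper keeps the cheap "should we even try" checks out of __zone_reclaim(), and the first of them rate-limits reclaim with time_before(jiffies, last_unsuccessful + interval). The sketch below restates that wrap-safe comparison in userspace; the macro body mirrors the kernel's definition, while the HZ and sample jiffies values are assumptions chosen for the demonstration:

#include <limits.h>
#include <stdio.h>

#define time_before(a, b)  ((long)((a) - (b)) < 0)

int main(void)
{
        unsigned long hz = 250;                      /* ticks per second (assumed) */
        unsigned long interval = 30 * hz;            /* zone_reclaim_interval      */
        unsigned long last = ULONG_MAX - 10;         /* just before a wrap         */
        unsigned long now = 20;                      /* just after the wrap        */

        /* Only 31 ticks have really elapsed, far less than 30*HZ: skip reclaim. */
        printf("skip reclaim? %d\n", time_before(now, last + interval));

        now = last + interval + 1;                   /* the interval has passed    */
        printf("skip reclaim? %d\n", time_before(now, last + interval));
        return 0;
}

Because the subtraction is done in the unsigned type and the result is read back as signed, the comparison keeps giving the right answer across a jiffies wrap.
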
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 74cb79eb917e..f6940618e345 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -16,11 +16,12 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mutex.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include "internal.h" 21#include "internal.h"
21 22
22/* session keyring create vs join semaphore */ 23/* session keyring create vs join semaphore */
23static DECLARE_MUTEX(key_session_sem); 24static DEFINE_MUTEX(key_session_mutex);
24 25
25/* the root user's tracking struct */ 26/* the root user's tracking struct */
26struct key_user root_key_user = { 27struct key_user root_key_user = {
@@ -711,7 +712,7 @@ long join_session_keyring(const char *name)
711 } 712 }
712 713
713 /* allow the user to join or create a named keyring */ 714 /* allow the user to join or create a named keyring */
714 down(&key_session_sem); 715 mutex_lock(&key_session_mutex);
715 716
716 /* look for an existing keyring of this name */ 717 /* look for an existing keyring of this name */
717 keyring = find_keyring_by_name(name, 0); 718 keyring = find_keyring_by_name(name, 0);
@@ -737,7 +738,7 @@ long join_session_keyring(const char *name)
737 key_put(keyring); 738 key_put(keyring);
738 739
739error2: 740error2:
740 up(&key_session_sem); 741 mutex_unlock(&key_session_mutex);
741error: 742error:
742 return ret; 743 return ret;
743 744
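
The keyring change is a straight semaphore-to-mutex conversion: key_session_sem only ever protected a critical section, so DECLARE_MUTEX/down/up become DEFINE_MUTEX/mutex_lock/mutex_unlock, which documents the intent and lets the mutex debugging checks apply. A userspace analogue of the resulting shape, using pthreads rather than the kernel API and a trimmed join_session_keyring() that only sketches the locking:

#include <pthread.h>
#include <stdio.h>

/* before: static DECLARE_MUTEX(key_session_sem);  -- a binary semaphore */
/* after:  static DEFINE_MUTEX(key_session_mutex); -- a genuine mutex    */
static pthread_mutex_t key_session_mutex = PTHREAD_MUTEX_INITIALIZER;

static long join_session_keyring(const char *name)
{
        long ret = 0;

        pthread_mutex_lock(&key_session_mutex);     /* was: down(&sem) */

        /* ... look up or create the named keyring ... */
        printf("joining session keyring '%s'\n", name);

        pthread_mutex_unlock(&key_session_mutex);   /* was: up(&sem)   */
        return ret;
}

int main(void)
{
        return (int)join_session_keyring("demo");
}
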
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5b16196f2823..ccaf988f3729 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -117,6 +117,8 @@ static struct security_operations *secondary_ops = NULL;
117static LIST_HEAD(superblock_security_head); 117static LIST_HEAD(superblock_security_head);
118static DEFINE_SPINLOCK(sb_security_lock); 118static DEFINE_SPINLOCK(sb_security_lock);
119 119
120static kmem_cache_t *sel_inode_cache;
121
120/* Allocate and free functions for each kind of security blob. */ 122/* Allocate and free functions for each kind of security blob. */
121 123
122static int task_alloc_security(struct task_struct *task) 124static int task_alloc_security(struct task_struct *task)
@@ -146,10 +148,11 @@ static int inode_alloc_security(struct inode *inode)
146 struct task_security_struct *tsec = current->security; 148 struct task_security_struct *tsec = current->security;
147 struct inode_security_struct *isec; 149 struct inode_security_struct *isec;
148 150
149 isec = kzalloc(sizeof(struct inode_security_struct), GFP_KERNEL); 151 isec = kmem_cache_alloc(sel_inode_cache, SLAB_KERNEL);
150 if (!isec) 152 if (!isec)
151 return -ENOMEM; 153 return -ENOMEM;
152 154
155 memset(isec, 0, sizeof(*isec));
153 init_MUTEX(&isec->sem); 156 init_MUTEX(&isec->sem);
154 INIT_LIST_HEAD(&isec->list); 157 INIT_LIST_HEAD(&isec->list);
155 isec->inode = inode; 158 isec->inode = inode;
@@ -172,7 +175,7 @@ static void inode_free_security(struct inode *inode)
172 spin_unlock(&sbsec->isec_lock); 175 spin_unlock(&sbsec->isec_lock);
173 176
174 inode->i_security = NULL; 177 inode->i_security = NULL;
175 kfree(isec); 178 kmem_cache_free(sel_inode_cache, isec);
176} 179}
177 180
178static int file_alloc_security(struct file *file) 181static int file_alloc_security(struct file *file)
@@ -1929,7 +1932,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
1929 struct task_security_struct *tsec; 1932 struct task_security_struct *tsec;
1930 struct inode_security_struct *dsec; 1933 struct inode_security_struct *dsec;
1931 struct superblock_security_struct *sbsec; 1934 struct superblock_security_struct *sbsec;
1932 struct inode_security_struct *isec;
1933 u32 newsid, clen; 1935 u32 newsid, clen;
1934 int rc; 1936 int rc;
1935 char *namep = NULL, *context; 1937 char *namep = NULL, *context;
@@ -1937,7 +1939,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
1937 tsec = current->security; 1939 tsec = current->security;
1938 dsec = dir->i_security; 1940 dsec = dir->i_security;
1939 sbsec = dir->i_sb->s_security; 1941 sbsec = dir->i_sb->s_security;
1940 isec = inode->i_security;
1941 1942
1942 if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) { 1943 if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) {
1943 newsid = tsec->create_sid; 1944 newsid = tsec->create_sid;
@@ -1957,7 +1958,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
1957 1958
1958 inode_security_set_sid(inode, newsid); 1959 inode_security_set_sid(inode, newsid);
1959 1960
1960 if (sbsec->behavior == SECURITY_FS_USE_MNTPOINT) 1961 if (!ss_initialized || sbsec->behavior == SECURITY_FS_USE_MNTPOINT)
1961 return -EOPNOTSUPP; 1962 return -EOPNOTSUPP;
1962 1963
1963 if (name) { 1964 if (name) {
@@ -4408,6 +4409,9 @@ static __init int selinux_init(void)
4408 tsec = current->security; 4409 tsec = current->security;
4409 tsec->osid = tsec->sid = SECINITSID_KERNEL; 4410 tsec->osid = tsec->sid = SECINITSID_KERNEL;
4410 4411
4412 sel_inode_cache = kmem_cache_create("selinux_inode_security",
4413 sizeof(struct inode_security_struct),
4414 0, SLAB_PANIC, NULL, NULL);
4411 avc_init(); 4415 avc_init();
4412 4416
4413 original_ops = secondary_ops = security_ops; 4417 original_ops = secondary_ops = security_ops;
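
The hooks.c change moves inode security blobs from kzalloc() to a dedicated kmem_cache created once in selinux_init() (with SLAB_PANIC, so boot fails loudly if the cache cannot be set up) and released with kmem_cache_free(). These blobs are small, fixed-size and allocated for every inode, which is exactly the workload a slab cache serves well. The toy cache below is only an analogy for that benefit, a size-specific free list plus the memset() the patch adds; it is not how the kernel allocator works internally:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct isec {                    /* stand-in for inode_security_struct */
        unsigned int sid;
        void *freelist_next;     /* reused for the cache's free list   */
};

static struct isec *cache_freelist;

static struct isec *cache_alloc(void)
{
        struct isec *obj = cache_freelist;

        if (obj)
                cache_freelist = obj->freelist_next;
        else
                obj = malloc(sizeof(*obj));
        if (obj)
                memset(obj, 0, sizeof(*obj));   /* like the memset() added above */
        return obj;
}

static void cache_free(struct isec *obj)
{
        obj->freelist_next = cache_freelist;    /* no free(): keep it for reuse */
        cache_freelist = obj;
}

int main(void)
{
        struct isec *a = cache_alloc();
        cache_free(a);
        struct isec *b = cache_alloc();         /* recycled without malloc() */

        printf("recycled: %s\n", a == b ? "yes" : "no");
        cache_free(b);
        return 0;
}
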
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index b5fa02d17b1e..f5d78365488f 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/mutex.h>
18#include <linux/init.h> 19#include <linux/init.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/security.h> 21#include <linux/security.h>
@@ -44,7 +45,7 @@ static int __init checkreqprot_setup(char *str)
44__setup("checkreqprot=", checkreqprot_setup); 45__setup("checkreqprot=", checkreqprot_setup);
45 46
46 47
47static DECLARE_MUTEX(sel_sem); 48static DEFINE_MUTEX(sel_mutex);
48 49
49/* global data for booleans */ 50/* global data for booleans */
50static struct dentry *bool_dir = NULL; 51static struct dentry *bool_dir = NULL;
@@ -230,7 +231,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
230 ssize_t length; 231 ssize_t length;
231 void *data = NULL; 232 void *data = NULL;
232 233
233 down(&sel_sem); 234 mutex_lock(&sel_mutex);
234 235
235 length = task_has_security(current, SECURITY__LOAD_POLICY); 236 length = task_has_security(current, SECURITY__LOAD_POLICY);
236 if (length) 237 if (length)
@@ -262,7 +263,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
262 else 263 else
263 length = count; 264 length = count;
264out: 265out:
265 up(&sel_sem); 266 mutex_unlock(&sel_mutex);
266 vfree(data); 267 vfree(data);
267 return length; 268 return length;
268} 269}
@@ -709,12 +710,11 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
709{ 710{
710 char *page = NULL; 711 char *page = NULL;
711 ssize_t length; 712 ssize_t length;
712 ssize_t end;
713 ssize_t ret; 713 ssize_t ret;
714 int cur_enforcing; 714 int cur_enforcing;
715 struct inode *inode; 715 struct inode *inode;
716 716
717 down(&sel_sem); 717 mutex_lock(&sel_mutex);
718 718
719 ret = -EFAULT; 719 ret = -EFAULT;
720 720
@@ -740,26 +740,9 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
740 740
741 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, 741 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing,
742 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); 742 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]);
743 if (length < 0) { 743 ret = simple_read_from_buffer(buf, count, ppos, page, length);
744 ret = length;
745 goto out;
746 }
747
748 if (*ppos >= length) {
749 ret = 0;
750 goto out;
751 }
752 if (count + *ppos > length)
753 count = length - *ppos;
754 end = count + *ppos;
755 if (copy_to_user(buf, (char *) page + *ppos, count)) {
756 ret = -EFAULT;
757 goto out;
758 }
759 *ppos = end;
760 ret = count;
761out: 744out:
762 up(&sel_sem); 745 mutex_unlock(&sel_mutex);
763 if (page) 746 if (page)
764 free_page((unsigned long)page); 747 free_page((unsigned long)page);
765 return ret; 748 return ret;
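
sel_read_bool() drops its hand-rolled *ppos/count clamping and copy_to_user() in favour of simple_read_from_buffer(), which performs the same bounds handling for any read served from an in-memory buffer. The userspace restatement below captures that clamping logic (memcpy standing in for copy_to_user, long long for loff_t); it is a sketch of the helper's behaviour, not the kernel source. Driving it in a loop shows the usual read() progression: partial reads advance the offset until EOF (a return of 0) is reached.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static ssize_t simple_read_from_buffer(char *to, size_t count, long long *ppos,
                                       const char *from, size_t available)
{
        long long pos = *ppos;

        if (pos < 0)
                return -22;                     /* -EINVAL */
        if ((size_t)pos >= available || count == 0)
                return 0;                       /* EOF */
        if (count > available - (size_t)pos)
                count = available - (size_t)pos;
        memcpy(to, from + pos, count);          /* kernel: copy_to_user() */
        *ppos = pos + count;
        return (ssize_t)count;
}

int main(void)
{
        const char page[] = "1 1";              /* "<current> <pending>" */
        char buf[8];
        long long ppos = 0;
        ssize_t n;

        while ((n = simple_read_from_buffer(buf, 2, &ppos, page, strlen(page))) > 0)
                printf("read %zd byte(s): %.*s\n", n, (int)n, buf);
        return 0;
}
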
@@ -773,7 +756,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
773 int new_value; 756 int new_value;
774 struct inode *inode; 757 struct inode *inode;
775 758
776 down(&sel_sem); 759 mutex_lock(&sel_mutex);
777 760
778 length = task_has_security(current, SECURITY__SETBOOL); 761 length = task_has_security(current, SECURITY__SETBOOL);
779 if (length) 762 if (length)
@@ -812,7 +795,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
812 length = count; 795 length = count;
813 796
814out: 797out:
815 up(&sel_sem); 798 mutex_unlock(&sel_mutex);
816 if (page) 799 if (page)
817 free_page((unsigned long) page); 800 free_page((unsigned long) page);
818 return length; 801 return length;
@@ -831,7 +814,7 @@ static ssize_t sel_commit_bools_write(struct file *filep,
831 ssize_t length = -EFAULT; 814 ssize_t length = -EFAULT;
832 int new_value; 815 int new_value;
833 816
834 down(&sel_sem); 817 mutex_lock(&sel_mutex);
835 818
836 length = task_has_security(current, SECURITY__SETBOOL); 819 length = task_has_security(current, SECURITY__SETBOOL);
837 if (length) 820 if (length)
@@ -869,7 +852,7 @@ static ssize_t sel_commit_bools_write(struct file *filep,
869 length = count; 852 length = count;
870 853
871out: 854out:
872 up(&sel_sem); 855 mutex_unlock(&sel_mutex);
873 if (page) 856 if (page)
874 free_page((unsigned long) page); 857 free_page((unsigned long) page);
875 return length; 858 return length;
@@ -987,7 +970,7 @@ out:
987 return ret; 970 return ret;
988err: 971err:
989 kfree(values); 972 kfree(values);
990 d_genocide(dir); 973 sel_remove_bools(dir);
991 ret = -ENOMEM; 974 ret = -ENOMEM;
992 goto out; 975 goto out;
993} 976}
@@ -1168,37 +1151,38 @@ static int sel_make_avc_files(struct dentry *dir)
1168 dentry = d_alloc_name(dir, files[i].name); 1151 dentry = d_alloc_name(dir, files[i].name);
1169 if (!dentry) { 1152 if (!dentry) {
1170 ret = -ENOMEM; 1153 ret = -ENOMEM;
1171 goto err; 1154 goto out;
1172 } 1155 }
1173 1156
1174 inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode); 1157 inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode);
1175 if (!inode) { 1158 if (!inode) {
1176 ret = -ENOMEM; 1159 ret = -ENOMEM;
1177 goto err; 1160 goto out;
1178 } 1161 }
1179 inode->i_fop = files[i].ops; 1162 inode->i_fop = files[i].ops;
1180 d_add(dentry, inode); 1163 d_add(dentry, inode);
1181 } 1164 }
1182out: 1165out:
1183 return ret; 1166 return ret;
1184err:
1185 d_genocide(dir);
1186 goto out;
1187} 1167}
1188 1168
1189static int sel_make_dir(struct super_block *sb, struct dentry *dentry) 1169static int sel_make_dir(struct inode *dir, struct dentry *dentry)
1190{ 1170{
1191 int ret = 0; 1171 int ret = 0;
1192 struct inode *inode; 1172 struct inode *inode;
1193 1173
1194 inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); 1174 inode = sel_make_inode(dir->i_sb, S_IFDIR | S_IRUGO | S_IXUGO);
1195 if (!inode) { 1175 if (!inode) {
1196 ret = -ENOMEM; 1176 ret = -ENOMEM;
1197 goto out; 1177 goto out;
1198 } 1178 }
1199 inode->i_op = &simple_dir_inode_operations; 1179 inode->i_op = &simple_dir_inode_operations;
1200 inode->i_fop = &simple_dir_operations; 1180 inode->i_fop = &simple_dir_operations;
1181 /* directory inodes start off with i_nlink == 2 (for "." entry) */
1182 inode->i_nlink++;
1201 d_add(dentry, inode); 1183 d_add(dentry, inode);
1184 /* bump link count on parent directory, too */
1185 dir->i_nlink++;
1202out: 1186out:
1203 return ret; 1187 return ret;
1204} 1188}
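
The two i_nlink increments added to sel_make_dir() follow the usual directory link-count rule: a new directory starts with two links ("." plus its name in the parent), and every subdirectory adds one more to the parent for its ".." entry. The same rule can be observed from userspace on a conventional filesystem such as ext4 (some filesystems, btrfs for instance, report 1 for directories regardless); the /tmp paths below exist only for this demonstration:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static void show(const char *path)
{
        struct stat st;

        if (stat(path, &st) == 0)
                printf("%-28s nlink=%lu\n", path, (unsigned long)st.st_nlink);
}

int main(void)
{
        mkdir("/tmp/selinuxfs_demo", 0700);
        show("/tmp/selinuxfs_demo");            /* 2: "." and the parent entry */

        mkdir("/tmp/selinuxfs_demo/booleans", 0700);
        show("/tmp/selinuxfs_demo");            /* 3: child's ".." added       */
        show("/tmp/selinuxfs_demo/booleans");   /* 2                           */

        rmdir("/tmp/selinuxfs_demo/booleans");
        rmdir("/tmp/selinuxfs_demo");
        return 0;
}
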
@@ -1207,7 +1191,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
 {
 	int ret;
 	struct dentry *dentry;
-	struct inode *inode;
+	struct inode *inode, *root_inode;
 	struct inode_security_struct *isec;
 
 	static struct tree_descr selinux_files[] = {
@@ -1228,30 +1212,33 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
 	};
 	ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files);
 	if (ret)
-		return ret;
+		goto err;
+
+	root_inode = sb->s_root->d_inode;
 
 	dentry = d_alloc_name(sb->s_root, BOOL_DIR_NAME);
-	if (!dentry)
-		return -ENOMEM;
+	if (!dentry) {
+		ret = -ENOMEM;
+		goto err;
+	}
 
-	inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
-	if (!inode)
-		goto out;
-	inode->i_op = &simple_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	d_add(dentry, inode);
-	bool_dir = dentry;
-	ret = sel_make_bools();
+	ret = sel_make_dir(root_inode, dentry);
 	if (ret)
-		goto out;
+		goto err;
+
+	bool_dir = dentry;
 
 	dentry = d_alloc_name(sb->s_root, NULL_FILE_NAME);
-	if (!dentry)
-		return -ENOMEM;
+	if (!dentry) {
+		ret = -ENOMEM;
+		goto err;
+	}
 
 	inode = sel_make_inode(sb, S_IFCHR | S_IRUGO | S_IWUGO);
-	if (!inode)
-		goto out;
+	if (!inode) {
+		ret = -ENOMEM;
+		goto err;
+	}
 	isec = (struct inode_security_struct*)inode->i_security;
 	isec->sid = SECINITSID_DEVNULL;
 	isec->sclass = SECCLASS_CHR_FILE;
@@ -1262,22 +1249,23 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
 	selinux_null = dentry;
 
 	dentry = d_alloc_name(sb->s_root, "avc");
-	if (!dentry)
-		return -ENOMEM;
+	if (!dentry) {
+		ret = -ENOMEM;
+		goto err;
+	}
 
-	ret = sel_make_dir(sb, dentry);
+	ret = sel_make_dir(root_inode, dentry);
 	if (ret)
-		goto out;
+		goto err;
 
 	ret = sel_make_avc_files(dentry);
 	if (ret)
-		goto out;
-
-	return 0;
+		goto err;
 out:
-	dput(dentry);
+	return ret;
+err:
 	printk(KERN_ERR "%s: failed while creating inodes\n", __FUNCTION__);
-	return -ENOMEM;
+	goto out;
 }
 
 static struct super_block *sel_get_sb(struct file_system_type *fs_type,
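Note: the selinuxfs.c hunks above follow the standard kernel semaphore-to-mutex conversion: sleeping critical sections previously guarded by down()/up() on a semaphore now take a struct mutex via mutex_lock()/mutex_unlock(), and the services.c hunks below make the matching DECLARE_MUTEX to DEFINE_MUTEX switch for load_mutex. A minimal sketch of the same pattern, using hypothetical names (demo_mutex, demo_value, demo_write) that are not part of this patch:

/*
 * Illustrative only: the semaphore-to-mutex conversion pattern used
 * throughout the selinuxfs hunks.  All names here are hypothetical.
 */
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(demo_mutex);	/* was: static DECLARE_MUTEX(demo_sem); */
static int demo_value;

static int demo_write(int new_value)
{
	int err = 0;

	mutex_lock(&demo_mutex);	/* was: down(&demo_sem); */
	if (new_value < 0) {
		err = -EINVAL;
		goto out;
	}
	demo_value = new_value;
out:
	mutex_unlock(&demo_mutex);	/* was: up(&demo_sem); */
	return err;
}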
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 8a764928ff4b..63e0b7f29cb5 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -27,7 +27,8 @@
 #include <linux/in.h>
 #include <linux/sched.h>
 #include <linux/audit.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
+
 #include "flask.h"
 #include "avc.h"
 #include "avc_ss.h"
@@ -48,9 +49,9 @@ static DEFINE_RWLOCK(policy_rwlock);
 #define POLICY_RDUNLOCK read_unlock(&policy_rwlock)
 #define POLICY_WRUNLOCK write_unlock_irq(&policy_rwlock)
 
-static DECLARE_MUTEX(load_sem);
-#define LOAD_LOCK down(&load_sem)
-#define LOAD_UNLOCK up(&load_sem)
+static DEFINE_MUTEX(load_mutex);
+#define LOAD_LOCK mutex_lock(&load_mutex)
+#define LOAD_UNLOCK mutex_unlock(&load_mutex)
 
 static struct sidtab sidtab;
 struct policydb policydb;