-rw-r--r--  Documentation/Intel-IOMMU.txt | 115
-rw-r--r--  Documentation/filesystems/Exporting | 115
-rw-r--r--  Documentation/i386/boot.txt | 34
-rw-r--r--  Documentation/kernel-parameters.txt | 17
-rw-r--r--  Documentation/memory-hotplug.txt | 58
-rw-r--r--  arch/ia64/kernel/efi.c | 4
-rw-r--r--  arch/ia64/kernel/setup.c | 14
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 15
-rw-r--r--  arch/x86/boot/compressed/misc_32.c | 3
-rw-r--r--  arch/x86/boot/header.S | 7
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 7
-rw-r--r--  arch/x86/kernel/e820_32.c | 18
-rw-r--r--  arch/x86/kernel/e820_64.c | 22
-rw-r--r--  arch/x86/kernel/efi_32.c | 4
-rw-r--r--  arch/x86/kernel/head_32.S | 44
-rw-r--r--  arch/x86/kernel/io_apic_64.c | 59
-rw-r--r--  arch/x86/kernel/pci-dma_64.c | 5
-rw-r--r--  arch/x86/kernel/setup_32.c | 4
-rw-r--r--  arch/x86/kernel/setup_64.c | 9
-rw-r--r--  arch/x86/mm/pageattr_64.c | 6
-rw-r--r--  arch/x86_64/Kconfig | 32
-rw-r--r--  drivers/base/memory.c | 9
-rw-r--r--  drivers/pci/Makefile | 3
-rw-r--r--  drivers/pci/dmar.c | 329
-rw-r--r--  drivers/pci/intel-iommu.c | 2271
-rw-r--r--  drivers/pci/intel-iommu.h | 325
-rw-r--r--  drivers/pci/iova.c | 394
-rw-r--r--  drivers/pci/iova.h | 63
-rw-r--r--  drivers/pci/pci.h | 1
-rw-r--r--  drivers/pci/probe.c | 14
-rw-r--r--  drivers/pci/search.c | 34
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/export.c | 2
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/efs/namei.c | 36
-rw-r--r--  fs/efs/super.c | 5
-rw-r--r--  fs/exportfs/expfs.c | 360
-rw-r--r--  fs/ext2/dir.c | 44
-rw-r--r--  fs/ext2/super.c | 36
-rw-r--r--  fs/ext3/super.c | 37
-rw-r--r--  fs/ext4/super.c | 37
-rw-r--r--  fs/fat/inode.c | 26
-rw-r--r--  fs/gfs2/ops_export.c | 83
-rw-r--r--  fs/gfs2/ops_fstype.h | 2
-rw-r--r--  fs/isofs/export.c | 69
-rw-r--r--  fs/isofs/isofs.h | 2
-rw-r--r--  fs/jfs/jfs_inode.h | 7
-rw-r--r--  fs/jfs/namei.c | 35
-rw-r--r--  fs/jfs/super.c | 7
-rw-r--r--  fs/libfs.c | 88
-rw-r--r--  fs/nfsd/export.c | 8
-rw-r--r--  fs/nfsd/nfsfh.c | 67
-rw-r--r--  fs/ntfs/namei.c | 77
-rw-r--r--  fs/ntfs/ntfs.h | 2
-rw-r--r--  fs/ocfs2/export.c | 67
-rw-r--r--  fs/ocfs2/export.h | 2
-rw-r--r--  fs/reiserfs/inode.c | 62
-rw-r--r--  fs/reiserfs/super.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 206
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.h | 50
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  include/acpi/actbl1.h | 27
-rw-r--r--  include/asm-x86/bootparam.h | 9
-rw-r--r--  include/asm-x86/cacheflush.h | 1
-rw-r--r--  include/asm-x86/device.h | 3
-rw-r--r--  include/linux/capability.h | 6
-rw-r--r--  include/linux/dmar.h | 86
-rw-r--r--  include/linux/efi.h | 2
-rw-r--r--  include/linux/efs_fs.h | 6
-rw-r--r--  include/linux/exportfs.h | 141
-rw-r--r--  include/linux/ext2_fs.h | 1
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/linkage.h | 6
-rw-r--r--  include/linux/memory.h | 31
-rw-r--r--  include/linux/pci.h | 2
-rw-r--r--  include/linux/reiserfs_fs.h | 12
-rw-r--r--  mm/memory_hotplug.c | 48
-rw-r--r--  mm/shmem.c | 37
-rw-r--r--  mm/slub.c | 118
-rw-r--r--  security/commoncap.c | 23
80 files changed, 5088 insertions, 937 deletions
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 00000000000..c2321903aa0
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
+Linux IOMMU Support
+===================
+
+The architecture spec can be obtained from the below location.
+
+http://www.intel.com/technology/virtualization/
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Some Keywords
+
+DMAR - DMA remapping
+DRHD - DMA Remapping Hardware Unit Definition structure
+RMRR - Reserved Memory Region Reporting structure
+ZLR  - Zero length reads from PCI devices
+IOVA - IO Virtual Address
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different DMA engines in the platform, and
+the device scope relationships between PCI devices and the DMA engine that
+controls them.
+
+What is RMRR?
+-------------
+
+There are some devices the BIOS controls, e.g. USB devices used to
+perform PS/2 emulation. The regions of memory used for these devices are
+marked reserved in the e820 map. When we turn on DMA translation, DMA to
+those regions will fail. Hence the BIOS uses RMRR to specify these regions
+along with the devices that need to access them. The OS is expected to set
+up unity mappings for these regions so these devices can access them.
+
+How is IOVA generated?
+----------------------
+
+Well-behaved drivers call pci_map_*() before sending a command to a device
+that needs to perform DMA. Once the DMA is completed and the mapping is no
+longer required, the driver performs a pci_unmap_*() call to unmap the
+region.
+
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIe
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+
+IOVA generation is pretty generic. We use the same technique as vmalloc(),
+but these are not global address spaces; they are separate for each domain.
+Different DMA engines may support different numbers of domains.
+
+We also allocate guard pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+
+
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding the
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+
+If it happens to be a PCI device included in the INCLUDE_ALL engine,
+then try enabling CONFIG_DMAR_GFX_WA to set up a 1:1 map. Graphics
+drivers are expected to start using the DMA API in the near future,
+at which point this option can be removed.
+
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address-translated (0xfee00000 - 0xfeefffff).
+The same is true for peer-to-peer transactions. Hence we reserve these
+addresses from the PCI MMIO ranges so they are not allocated as IOVAs.
+
+
+Fault reporting
+---------------
+When errors are reported, the DMA engine signals via an interrupt. The
+fault reason and the device that caused it are printed on the console.
+
+See below for a sample.
+
+
+Boot Message Sample
+-------------------
+
+Something like this gets printed indicating the presence of DMAR tables
+in ACPI:
+
+ACPI: DMAR (v001 A M I  OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When the DMAR table is processed and initialized by ACPI, the DMAR
+locations and any RMRRs processed are printed:
+
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice:
+
+PCI-DMA: Using DMAR IOMMU
+
+Fault reporting
+---------------
+
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+
+TBD
+---
+
+- For compatibility testing, could use a unity-map domain for all devices,
+  i.e. provide a 1:1 map of all useful memory under a single domain for
+  all devices.
+- An API of paravirt ops for abstracting this functionality for VMM folks.
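
The pci_map_*()/pci_unmap_*() pattern the new document refers to looks
roughly like this in a driver (a minimal sketch against the 2.6-era PCI DMA
API; the function and variable names here are placeholders, not code from
this patch):

	static int example_dma_to_device(struct pci_dev *pdev,
					 void *buf, size_t len)
	{
		dma_addr_t handle;

		/* Allocates an IOVA in the device's domain and programs
		 * the IOMMU page tables for it. */
		handle = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
		if (pci_dma_mapping_error(handle))
			return -EIO;

		/* ... hand "handle" to the device and wait for the DMA ... */

		/* Tears the mapping down; with guard pages, a device that
		 * overruns the buffer faults instead of corrupting memory. */
		pci_unmap_single(pdev, handle, len, PCI_DMA_TODEVICE);
		return 0;
	}
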
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
index 31047e0fe14..87019d2b598 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/Exporting
@@ -2,9 +2,12 @@
 Making Filesystems Exportable
 =============================
 
-Most filesystem operations require a dentry (or two) as a starting
+Overview
+--------
+
+All filesystem operations require a dentry (or two) as a starting
 point. Local applications have a reference-counted hold on suitable
-dentrys via open file descriptors or cwd/root. However remote
+dentries via open file descriptors or cwd/root. However remote
 applications that access a filesystem via a remote filesystem protocol
 such as NFS may not be able to hold such a reference, and so need a
 different way to refer to a particular dentry. As the alternative
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most
 problematic), there is no simple answer like 'filename'.
 
 The mechanism discussed here allows each filesystem implementation to
-specify how to generate an opaque (out side of the filesystem) byte
+specify how to generate an opaque (outside of the filesystem) byte
 string for any dentry, and how to find an appropriate dentry for any
 given opaque byte string.
 This byte string will be called a "filehandle fragment" as it
 corresponds to part of an NFS filehandle.
 
 A filesystem which supports the mapping between filehandle fragments
-and dentrys will be termed "exportable".
+and dentries will be termed "exportable".
 
 
 
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must:
    1/ provide the filehandle fragment routines described below.
    2/ make sure that d_splice_alias is used rather than d_add
       when ->lookup finds an inode for a given parent and name.
-      Typically the ->lookup routine will end:
-		if (inode)
-			return d_splice(inode, dentry);
-		d_add(dentry, inode);
-		return NULL;
+      Typically the ->lookup routine will end with a:
+
+		return d_splice_alias(inode, dentry);
 	}
 
 
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must:
  A file system implementation declares that instances of the filesystem
 are exportable by setting the s_export_op field in the struct
 super_block. This field must point to a "struct export_operations"
-struct which could potentially be full of NULLs, though normally at
-least get_parent will be set.
-
- The primary operations are decode_fh and encode_fh.
-decode_fh takes a filehandle fragment and tries to find or create a
-dentry for the object referred to by the filehandle.
-encode_fh takes a dentry and creates a filehandle fragment which can
-later be used to find/create a dentry for the same object.
-
-decode_fh will probably make use of "find_exported_dentry".
-This function lives in the "exportfs" module which a filesystem does
-not need unless it is being exported. So rather that calling
-find_exported_dentry directly, each filesystem should call it through
-the find_exported_dentry pointer in it's export_operations table.
-This field is set correctly by the exporting agent (e.g. nfsd) when a
-filesystem is exported, and before any export operations are called.
-
-find_exported_dentry needs three support functions from the
-filesystem:
-  get_name. When given a parent dentry and a child dentry, this
-    should find a name in the directory identified by the parent
-    dentry, which leads to the object identified by the child dentry.
-    If no get_name function is supplied, a default implementation is
-    provided which uses vfs_readdir to find potential names, and
-    matches inode numbers to find the correct match.
-
-  get_parent. When given a dentry for a directory, this should return
-    a dentry for the parent. Quite possibly the parent dentry will
-    have been allocated by d_alloc_anon.
-    The default get_parent function just returns an error so any
-    filehandle lookup that requires finding a parent will fail.
-    ->lookup("..") is *not* used as a default as it can leave ".."
-    entries in the dcache which are too messy to work with.
-
-  get_dentry. When given an opaque datum, this should find the
-    implied object and create a dentry for it (possibly with
-    d_alloc_anon).
-    The opaque datum is whatever is passed down by the decode_fh
-    function, and is often simply a fragment of the filehandle
-    fragment.
-    decode_fh passes two datums through find_exported_dentry. One that
-    should be used to identify the target object, and one that can be
-    used to identify the object's parent, should that be necessary.
-    The default get_dentry function assumes that the datum contains an
-    inode number and a generation number, and it attempts to get the
-    inode using "iget" and check it's validity by matching the
-    generation number. A filesystem should only depend on the default
-    if iget can safely be used this way.
-
-If decode_fh and/or encode_fh are left as NULL, then default
-implementations are used. These defaults are suitable for ext2 and
-extremely similar filesystems (like ext3).
-
-The default encode_fh creates a filehandle fragment from the inode
-number and generation number of the target together with the inode
-number and generation number of the parent (if the parent is
-required).
-
-The default decode_fh extract the target and parent datums from the
-filehandle assuming the format used by the default encode_fh and
-passed them to find_exported_dentry.
+struct which has the following members:
+
+ encode_fh (optional)
+    Takes a dentry and creates a filehandle fragment which can later be used
+    to find or create a dentry for the same object. The default
+    implementation creates a filehandle fragment that encodes a 32bit inode
+    and generation number for the inode encoded, and if necessary the
+    same information for the parent.
+
+ fh_to_dentry (mandatory)
+    Given a filehandle fragment, this should find the implied object and
+    create a dentry for it (possibly with d_alloc_anon).
+
+ fh_to_parent (optional but strongly recommended)
+    Given a filehandle fragment, this should find the parent of the
+    implied object and create a dentry for it (possibly with d_alloc_anon).
+    May fail if the filehandle fragment is too small.
+
+ get_parent (optional but strongly recommended)
+    When given a dentry for a directory, this should return a dentry for
+    the parent. Quite possibly the parent dentry will have been allocated
+    by d_alloc_anon. The default get_parent function just returns an error
+    so any filehandle lookup that requires finding a parent will fail.
+    ->lookup("..") is *not* used as a default as it can leave ".." entries
+    in the dcache which are too messy to work with.
+
+ get_name (optional)
+    When given a parent dentry and a child dentry, this should find a name
+    in the directory identified by the parent dentry, which leads to the
+    object identified by the child dentry. If no get_name function is
+    supplied, a default implementation is provided which uses vfs_readdir
+    to find potential names, and matches inode numbers to find the correct
+    match.
 
 
 A filehandle fragment consists of an array of 1 or more 4byte words,
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with
 nuls. Rather, the encode_fh routine should choose a "type" which
 indicates the decode_fh how much of the filehandle is valid, and how
 it should be interpreted.
-
-
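
For reference, the new-style export_operations described above might be
wired up as follows for a hypothetical "examplefs" (a sketch against the
exportfs interface this series introduces; examplefs_iget() and
examplefs_get_parent() are assumed helpers, not part of the patch):

	#include <linux/exportfs.h>

	static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
			struct fid *fid, int fh_len, int fh_type)
	{
		struct inode *inode;

		if (fh_type != FILEID_INO32_GEN || fh_len < 2)
			return NULL;
		/* Look the inode up by number and verify the generation. */
		inode = examplefs_iget(sb, fid->i32.ino, fid->i32.gen);
		if (IS_ERR(inode))
			return ERR_PTR(PTR_ERR(inode));
		return d_alloc_anon(inode);
	}

	static struct export_operations examplefs_export_ops = {
		/* encode_fh omitted: the 32bit inode+generation default */
		.fh_to_dentry	= examplefs_fh_to_dentry,
		.get_parent	= examplefs_get_parent,
	};

The filesystem would then set sb->s_export_op = &examplefs_export_ops at
mount time.
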
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index 35985b34d5a..2f75e750e4f 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -168,6 +168,8 @@ Offset Proto Name Meaning
 0234/1	2.05+	relocatable_kernel Whether kernel is relocatable or not
 0235/3	N/A	pad2		Unused
 0238/4	2.06+	cmdline_size	Maximum size of the kernel command line
+023C/4	2.07+	hardware_subarch Hardware subarchitecture
+0240/8	2.07+	hardware_subarch_data Subarchitecture-specific data
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
@@ -204,7 +206,7 @@ boot loaders can ignore those fields.
 
 The byte order of all fields is littleendian (this is x86, after all.)
 
-Field name:	setup_secs
+Field name:	setup_sects
 Type:		read
 Offset/size:	0x1f1/1
 Protocol:	ALL
@@ -356,6 +358,13 @@ Protocol:	2.00+
   - If 0, the protected-mode code is loaded at 0x10000.
   - If 1, the protected-mode code is loaded at 0x100000.
 
+  Bit 6 (write): KEEP_SEGMENTS
+	Protocol: 2.07+
+	- if 0, reload the segment registers in the 32bit entry point.
+	- if 1, do not reload the segment registers in the 32bit entry point.
+	  Assume that %cs %ds %ss %es are all set to flat segments with
+	  a base of 0 (or the equivalent for their environment).
+
   Bit 7 (write): CAN_USE_HEAP
 	Set this bit to 1 to indicate that the value entered in the
 	heap_end_ptr is valid.  If this field is clear, some setup code
@@ -480,6 +489,29 @@ Protocol:	2.06+
   cmdline_size characters. With protocol version 2.05 and earlier, the
   maximum size was 255.
 
+Field name:	hardware_subarch
+Type:		write
+Offset/size:	0x23c/4
+Protocol:	2.07+
+
+  In a paravirtualized environment the hardware low-level architectural
+  pieces such as interrupt handling, page table handling, and
+  accessing process control registers need to be done differently.
+
+  This field allows the bootloader to inform the kernel that we are in
+  one of those environments.
+
+  0x00000000	The default x86/PC environment
+  0x00000001	lguest
+  0x00000002	Xen
+
+Field name:	hardware_subarch_data
+Type:		write
+Offset/size:	0x240/8
+Protocol:	2.07+
+
+  A pointer to data that is specific to the hardware subarch
+
 
 **** THE KERNEL COMMAND LINE
 
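
From the bootloader side, the new fields are simple pokes into the setup
header once the kernel is known to speak protocol 2.07 (an illustrative
sketch, not from this patch; the 0x206 version and 0x211 loadflags offsets
are from the boot protocol, 0x23c/0x240 are the fields added above):

	#include <stdint.h>

	static void mark_lguest_boot(uint8_t *setup_header_base)
	{
		if (*(uint16_t *)(setup_header_base + 0x206) < 0x0207)
			return;	/* pre-2.07 kernel: leave the fields alone */

		/* hardware_subarch = 1 (lguest), no subarch data */
		*(uint32_t *)(setup_header_base + 0x23c) = 0x00000001;
		*(uint64_t *)(setup_header_base + 0x240) = 0;

		/* loadflags bit 6: KEEP_SEGMENTS */
		*(setup_header_base + 0x211) |= 1 << 6;
	}
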
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6accd360da7..b2361667839 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -772,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	inttest=	[IA64]
 
+	intel_iommu=	[DMAR] Intel IOMMU driver (DMAR) option
+		off
+			Disable the Intel IOMMU driver.
+		igfx_off [Default Off]
+			By default, gfx is mapped as a normal device. If a gfx
+			device has a dedicated DMAR unit, the DMAR unit is
+			bypassed by not enabling DMAR with this option. In
+			this case the gfx device will use physical addresses
+			for DMA.
+		forcedac [x86_64]
+			With this option the IOMMU will not optimize by
+			looking for I/O virtual addresses below 32 bits,
+			forcing dual address cycles on the PCI bus for cards
+			supporting greater than 32-bit addressing. The default
+			is to look for a translation below 32 bits and, if
+			none is available, to look in the higher range.
+
 	io7=		[HW] IO7 for Marvel based alpha systems
 			See comment before marvel_specify_io7 in
 			arch/alpha/kernel/core_marvel.c.
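
As an illustrative example, booting with the IOMMU enabled but the
integrated graphics engine left untranslated would add the following to the
kernel command line (other parameters elided):

	intel_iommu=igfx_off
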
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5fbcc22c98e..168117bd6ee 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -2,7 +2,8 @@
 Memory Hotplug
 ==============
 
-Last Updated: Jul 28 2007
+Created:						Jul 28 2007
+Added description of the memory hotplug notifier:	Oct 11 2007
 
 This document is about memory hotplug including how-to-use and current status.
 Because Memory Hotplug is still under development, contents of this text will
@@ -24,7 +25,8 @@ be changed often.
   6.1 Memory offline and ZONE_MOVABLE
   6.2. How to offline memory
 7. Physical memory remove
-8. Future Work List
+8. Memory hotplug event notifier
+9. Future Work List
 
 Note(1): x86_64's has special implementation for memory hotplug.
          This text does not describe it.
@@ -307,8 +309,58 @@ Need more implementation yet....
  - Notification completion of remove works by OS to firmware.
  - Guard from remove if not yet.
 
+--------------------------------
+8. Memory hotplug event notifier
+--------------------------------
+Memory hotplug has an event notifier. There are 6 types of notification.
+
+MEMORY_GOING_ONLINE
+  Generated before new memory becomes available, in order to be able to
+  prepare subsystems to handle memory. The page allocator is still unable
+  to allocate from the new memory.
+
+MEMORY_CANCEL_ONLINE
+  Generated if MEMORY_GOING_ONLINE fails.
+
+MEMORY_ONLINE
+  Generated when memory has successfully been brought online. The callback
+  may allocate pages from the new memory.
+
+MEMORY_GOING_OFFLINE
+  Generated to begin the process of offlining memory. Allocations are no
+  longer possible from the memory, but some of the memory to be offlined
+  is still in use. The callback can be used to free memory known to a
+  subsystem from the indicated memory section.
+
+MEMORY_CANCEL_OFFLINE
+  Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
+  the section that we attempted to offline.
+
+MEMORY_OFFLINE
+  Generated after offlining memory is complete.
+
+A callback routine can be registered by
+  hotplug_memory_notifier(callback_func, priority)
+
+The second argument of the callback function (action) is one of the event
+types above. The third argument is a pointer to a struct memory_notify:
+
+struct memory_notify {
+	unsigned long start_pfn;
+	unsigned long nr_pages;
+	int status_change_nid;
+}
+
+start_pfn is the start_pfn of the online/offline memory.
+nr_pages is the number of pages of the online/offline memory.
+status_change_nid is set to the node id when N_HIGH_MEMORY of the nodemask
+is (or will be) set/cleared: a new (memoryless) node gains memory by going
+online, or a node loses all of its memory. If this is -1, the nodemask
+status is not changed.
+If status_change_nid >= 0, the callback should create/discard structures
+for the node if necessary.
+
 --------------
-8. Future Work
+9. Future Work
 --------------
  - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
    sysctl or new control file.
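
A callback registered through hotplug_memory_notifier() then has this shape
(a minimal sketch; the MEM_* action constants and struct memory_notify come
from include/linux/memory.h, and the "example_" names are placeholders):

	#include <linux/memory.h>
	#include <linux/notifier.h>

	static int example_mem_event(struct notifier_block *self,
				     unsigned long action, void *arg)
	{
		struct memory_notify *mn = arg;

		switch (action) {
		case MEM_GOING_ONLINE:
			/* Prepare per-node structures if
			 * mn->status_change_nid >= 0; a NOTIFY_BAD return
			 * here is expected to cancel the online. */
			break;
		case MEM_GOING_OFFLINE:
			/* Drop anything cached in the range
			 * [mn->start_pfn, mn->start_pfn + mn->nr_pages). */
			break;
		case MEM_ONLINE:
		case MEM_OFFLINE:
		case MEM_CANCEL_ONLINE:
		case MEM_CANCEL_OFFLINE:
			break;
		}
		return NOTIFY_OK;
	}

	static int __init example_init(void)
	{
		hotplug_memory_notifier(example_mem_event, 0);
		return 0;
	}
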
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 8e4894b205e..3f7ea13358e 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1090,7 +1090,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 
 void
 efi_initialize_iomem_resources(struct resource *code_resource,
-		struct resource *data_resource)
+		struct resource *data_resource,
+		struct resource *bss_resource)
 {
 	struct resource *res;
 	void *efi_map_start, *efi_map_end, *p;
@@ -1171,6 +1172,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 	 */
 	insert_resource(res, code_resource);
 	insert_resource(res, data_resource);
+	insert_resource(res, bss_resource);
 #ifdef CONFIG_KEXEC
 	insert_resource(res, &efi_memmap_res);
 	insert_resource(res, &boot_param_res);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index cbf67f1aa29..ae6c3c02e11 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -90,7 +90,12 @@ static struct resource code_resource = {
 	.name	= "Kernel code",
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
 };
-extern char _text[], _end[], _etext[];
+
+static struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+extern char _text[], _end[], _etext[], _edata[], _bss[];
 
 unsigned long ia64_max_cacheline_size;
 
@@ -200,8 +205,11 @@ static int __init register_memory(void)
 	code_resource.start = ia64_tpa(_text);
 	code_resource.end   = ia64_tpa(_etext) - 1;
 	data_resource.start = ia64_tpa(_etext);
-	data_resource.end   = ia64_tpa(_end) - 1;
-	efi_initialize_iomem_resources(&code_resource, &data_resource);
+	data_resource.end   = ia64_tpa(_edata) - 1;
+	bss_resource.start  = ia64_tpa(_bss);
+	bss_resource.end    = ia64_tpa(_end) - 1;
+	efi_initialize_iomem_resources(&code_resource, &data_resource,
+			&bss_resource);
 
 	return 0;
 }
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f35ea223752..a0ae2e7f6ce 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -27,13 +27,22 @@
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/boot.h>
+#include <asm/asm-offsets.h>
 
 .section ".text.head","ax",@progbits
 	.globl startup_32
 
 startup_32:
-	cld
-	cli
+	/* check to see if KEEP_SEGMENTS flag is meaningful */
+	cmpw $0x207, BP_version(%esi)
+	jb 1f
+
+	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
+	 * us to not reload segments */
+	testb $(1<<6), BP_loadflags(%esi)
+	jnz 2f
+
+1:	cli
 	movl $(__BOOT_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
@@ -41,6 +50,8 @@ startup_32:
 	movl %eax,%gs
 	movl %eax,%ss
 
+2:	cld
+
 /* Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing else will tell us what
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c
index 1dc1e19c0a9..b74d60d1b2f 100644
--- a/arch/x86/boot/compressed/misc_32.c
+++ b/arch/x86/boot/compressed/misc_32.c
@@ -247,6 +247,9 @@ static void putstr(const char *s)
 	int x,y,pos;
 	char c;
 
+	if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0)
+		return;
+
 	x = RM_SCREEN_INFO.orig_x;
 	y = RM_SCREEN_INFO.orig_y;
 
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index f3140e596d4..8353c81c41c 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -119,7 +119,7 @@ _start:
 # Part 2 of the header, from the old setup.S
 
 		.ascii	"HdrS"		# header signature
-		.word	0x0206		# header version number (>= 0x0105)
+		.word	0x0207		# header version number (>= 0x0105)
 					# or else old loadlin-1.5 will fail)
 		.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
@@ -214,6 +214,11 @@ cmdline_size:	.long	COMMAND_LINE_SIZE-1	#length of the command line,
 						#added with boot protocol
 						#version 2.06
 
+hardware_subarch:	.long 0			# subarchitecture, added with 2.07
+						# default to 0 for normal x86 PC
+
+hardware_subarch_data:	.quad 0
+
 # End of setup header #####################################################
 
 	.section ".inittext", "ax"
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index f1b7cdda82b..f8764716b0c 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -15,6 +15,7 @@
 #include <asm/fixmap.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
+#include <asm/bootparam.h>
 #include <asm/elf.h>
 
 #include <xen/interface/xen.h>
@@ -146,4 +147,10 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
+
+	BLANK();
+	OFFSET(BP_scratch, boot_params, scratch);
+	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+	OFFSET(BP_version, boot_params, hdr.version);
 }
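
The OFFSET() entries above land in the generated asm-offsets.h, which is
what lets head_32.S test boot_params fields before any C code runs. The
generated lines plausibly look like this (the values are an assumption
derived from the zero-page layout documented in boot.txt, not taken from a
build):

	#define BP_scratch 484			/* 0x1e4 */
	#define BP_version 518			/* 0x206 */
	#define BP_loadflags 529		/* 0x211 */
	#define BP_hardware_subarch 572		/* 0x23c */
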
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 58fd54eb557..18f500d185a 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -51,6 +51,13 @@ struct resource code_resource = {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
 static struct resource system_rom_resource = {
 	.name	= "System ROM",
 	.start	= 0xf0000,
@@ -254,7 +261,9 @@ static void __init probe_roms(void)
  * and also for regions reported as reserved by the e820.
  */
 static void __init
-legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+legacy_init_iomem_resources(struct resource *code_resource,
+		struct resource *data_resource,
+		struct resource *bss_resource)
 {
 	int i;
 
@@ -287,6 +296,7 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
 	 */
 		request_resource(res, code_resource);
 		request_resource(res, data_resource);
+		request_resource(res, bss_resource);
 #ifdef CONFIG_KEXEC
 		if (crashk_res.start != crashk_res.end)
 			request_resource(res, &crashk_res);
@@ -307,9 +317,11 @@ static int __init request_standard_resources(void)
 
 	printk("Setting up standard PCI resources\n");
 	if (efi_enabled)
-		efi_initialize_iomem_resources(&code_resource, &data_resource);
+		efi_initialize_iomem_resources(&code_resource,
+				&data_resource, &bss_resource);
 	else
-		legacy_init_iomem_resources(&code_resource, &data_resource);
+		legacy_init_iomem_resources(&code_resource,
+				&data_resource, &bss_resource);
 
 	/* EFI systems may still have VGA */
 	request_resource(&iomem_resource, &video_ram_resource);
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 57616865d8a..04698e0b056 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -47,7 +47,7 @@ unsigned long end_pfn_map;
  */
 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
 
-extern struct resource code_resource, data_resource;
+extern struct resource code_resource, data_resource, bss_resource;
 
 /* Check for some hardcoded bad areas that early boot is not allowed to touch */
 static inline int bad_addr(unsigned long *addrp, unsigned long size)
@@ -225,6 +225,7 @@ void __init e820_reserve_resources(void)
 		 */
 		request_resource(res, &code_resource);
 		request_resource(res, &data_resource);
+		request_resource(res, &bss_resource);
 #ifdef CONFIG_KEXEC
 		if (crashk_res.start != crashk_res.end)
 			request_resource(res, &crashk_res);
@@ -729,3 +730,22 @@ __init void e820_setup_gap(void)
 	printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
 		pci_mem_start, gapstart, gapsize);
 }
+
+int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
+{
+	int i;
+
+	if (slot < 0 || slot >= e820.nr_map)
+		return -1;
+	for (i = slot; i < e820.nr_map; i++) {
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		break;
+	}
+	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
+		return -1;
+	*addr = e820.map[i].addr;
+	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
+		max_pfn << PAGE_SHIFT) - *addr;
+	return i + 1;
+}
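
The new arch_get_ram_range() is an iterator: it returns the index of the
next slot to query (i + 1), or -1 when no E820_RAM entry below max_pfn
remains. A caller would walk it like this (sketch only; the surrounding
function is a placeholder):

	static void example_walk_ram(void)
	{
		u64 addr, size;
		int slot = 0;

		while ((slot = arch_get_ram_range(slot, &addr, &size)) >= 0) {
			/* [addr, addr + size) is usable RAM below max_pfn */
		}
	}
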
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index b42558c48e9..e2be78f4939 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -603,7 +603,8 @@ void __init efi_enter_virtual_mode(void)
 
 void __init
 efi_initialize_iomem_resources(struct resource *code_resource,
-			       struct resource *data_resource)
+			       struct resource *data_resource,
+			       struct resource *bss_resource)
 {
 	struct resource *res;
 	efi_memory_desc_t *md;
@@ -675,6 +676,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 		if (md->type == EFI_CONVENTIONAL_MEMORY) {
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
+			request_resource(res, bss_resource);
 #ifdef CONFIG_KEXEC
 			request_resource(res, &crashk_res);
 #endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 39677965e16..00b1c2c5645 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -79,22 +79,30 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_
  */
 .section .text.head,"ax",@progbits
 ENTRY(startup_32)
+	/* check to see if KEEP_SEGMENTS flag is meaningful */
+	cmpw $0x207, BP_version(%esi)
+	jb 1f
+
+	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
+	   us to not reload segments */
+	testb $(1<<6), BP_loadflags(%esi)
+	jnz 2f
 
 /*
  * Set segments to known values.
  */
-	cld
-	lgdt boot_gdt_descr - __PAGE_OFFSET
+1:	lgdt boot_gdt_descr - __PAGE_OFFSET
 	movl $(__BOOT_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
 	movl %eax,%fs
 	movl %eax,%gs
+2:
 
 /*
  * Clear BSS first so that there are no surprises...
- * No need to cld as DF is already clear from cld above...
  */
+	cld
 	xorl %eax,%eax
 	movl $__bss_start - __PAGE_OFFSET,%edi
 	movl $__bss_stop - __PAGE_OFFSET,%ecx
@@ -128,6 +136,35 @@ ENTRY(startup_32)
 	movsl
 1:
 
+#ifdef CONFIG_PARAVIRT
+	cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET)
+	jb default_entry
+
+	/* Paravirt-compatible boot parameters.  Look to see what architecture
+	   we're booting under. */
+	movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax
+	cmpl $num_subarch_entries, %eax
+	jae bad_subarch
+
+	movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax
+	subl $__PAGE_OFFSET, %eax
+	jmp *%eax
+
+bad_subarch:
+WEAK(lguest_entry)
+WEAK(xen_entry)
+	/* Unknown implementation; there's really
+	   nothing we can do at this point. */
+	ud2a
+.data
+subarch_entries:
+	.long default_entry		/* normal x86/PC */
+	.long lguest_entry		/* lguest hypervisor */
+	.long xen_entry			/* Xen hypervisor */
+num_subarch_entries = (. - subarch_entries) / 4
+.previous
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Initialize page tables.  This creates a PDE and a set of page
  * tables, which are located immediately beyond _end.  The variable
@@ -140,6 +177,7 @@ ENTRY(startup_32)
  */
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
+default_entry:
 	movl $(pg0 - __PAGE_OFFSET), %edi
 	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
 	movl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index b3c2d268d70..953328b55a3 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -31,6 +31,7 @@
 #include <linux/sysdev.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
+#include <linux/dmar.h>
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
@@ -2031,8 +2032,64 @@ void arch_teardown_msi_irq(unsigned int irq)
 	destroy_irq(irq);
 }
 
-#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_DMAR
+#ifdef CONFIG_SMP
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg = irq_cfg + irq;
+	struct msi_msg msg;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	dmar_msi_read(irq, &msg);
+
+	msg.data &= ~MSI_DATA_VECTOR_MASK;
+	msg.data |= MSI_DATA_VECTOR(cfg->vector);
+	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+	dmar_msi_write(irq, &msg);
+	irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip dmar_msi_type = {
+	.name = "DMAR_MSI",
+	.unmask = dmar_msi_unmask,
+	.mask = dmar_msi_mask,
+	.ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity = dmar_msi_set_affinity,
+#endif
+	.retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+	int ret;
+	struct msi_msg msg;
+
+	ret = msi_compose_msg(NULL, irq, &msg);
+	if (ret < 0)
+		return ret;
+	dmar_msi_write(irq, &msg);
+	set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+		"edge");
+	return 0;
+}
+#endif
 
+#endif /* CONFIG_PCI_MSI */
 /*
  * Hypertransport interrupt support
  */
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index afaf9f12c03..393e2725a6e 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/pci.h>
 #include <linux/module.h>
+#include <linux/dmar.h>
 #include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/calgary.h>
@@ -305,6 +306,8 @@ void __init pci_iommu_alloc(void)
 	detect_calgary();
 #endif
 
+	detect_intel_iommu();
+
 #ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
 #endif
@@ -316,6 +319,8 @@ static int __init pci_iommu_init(void)
 	calgary_iommu_init();
 #endif
 
+	intel_iommu_init();
+
 #ifdef CONFIG_IOMMU
 	gart_iommu_init();
 #endif
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index ba2e165a8a0..cc0e91447b7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -60,6 +60,7 @@
 #include <asm/vmi.h>
 #include <setup_arch.h>
 #include <bios_ebda.h>
+#include <asm/cacheflush.h>
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
@@ -73,6 +74,7 @@ int disable_pse __devinitdata = 0;
  */
 extern struct resource code_resource;
 extern struct resource data_resource;
+extern struct resource bss_resource;
 
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -600,6 +602,8 @@ void __init setup_arch(char **cmdline_p)
 	code_resource.end = virt_to_phys(_etext)-1;
 	data_resource.start = virt_to_phys(_etext);
 	data_resource.end = virt_to_phys(_edata)-1;
+	bss_resource.start = virt_to_phys(&__bss_start);
+	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
 	parse_early_param();
 
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 31322d42eaa..e7a9e36bd52 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -58,6 +58,7 @@
 #include <asm/numa.h>
 #include <asm/sections.h>
 #include <asm/dmi.h>
+#include <asm/cacheflush.h>
 
 /*
  * Machine setup..
@@ -133,6 +134,12 @@ struct resource code_resource = {
 	.end = 0,
 	.flags = IORESOURCE_RAM,
 };
+struct resource bss_resource = {
+	.name = "Kernel bss",
+	.start = 0,
+	.end = 0,
+	.flags = IORESOURCE_RAM,
+};
 
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
@@ -276,6 +283,8 @@ void __init setup_arch(char **cmdline_p)
 	code_resource.end = virt_to_phys(&_etext)-1;
 	data_resource.start = virt_to_phys(&_etext);
 	data_resource.end = virt_to_phys(&_edata)-1;
+	bss_resource.start = virt_to_phys(&__bss_start);
+	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
 	early_identify_cpu(&boot_cpu_data);
 
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
index c7b7dfe1d40..c40afbaaf93 100644
--- a/arch/x86/mm/pageattr_64.c
+++ b/arch/x86/mm/pageattr_64.c
@@ -61,10 +61,10 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	return base;
 }
 
-static void cache_flush_page(void *adr)
+void clflush_cache_range(void *adr, int size)
 {
 	int i;
-	for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
+	for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
 		clflush(adr+i);
 }
 
@@ -80,7 +80,7 @@ static void flush_kernel_map(void *arg)
 		asm volatile("wbinvd" ::: "memory");
 	else list_for_each_entry(pg, l, lru) {
 		void *adr = page_address(pg);
-		cache_flush_page(adr);
+		clflush_cache_range(adr, PAGE_SIZE);
 	}
 	__flush_tlb_all();
 }
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index aab25f3ba3c..c2d24991bb2 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -750,6 +750,38 @@ config PCI_DOMAINS
 	depends on PCI
 	default y
 
+config DMAR
+	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
+	depends on PCI_MSI && ACPI && EXPERIMENTAL
+	default y
+	help
+	  DMA remapping (DMAR) devices support enables independent address
+	  translations for Direct Memory Access (DMA) from devices.
+	  These DMA remapping devices are reported via ACPI tables
+	  and include PCI device scope covered by these DMA
+	  remapping devices.
+
+config DMAR_GFX_WA
+	bool "Support for Graphics workaround"
+	depends on DMAR
+	default y
+	help
+	  Current graphics drivers tend to use physical addresses
+	  for DMA and avoid using DMA APIs. Setting this config
+	  option permits the IOMMU driver to set a unity map for
+	  all the OS-visible memory. Hence the driver can continue
+	  to use physical addresses for DMA.
+
+config DMAR_FLOPPY_WA
+	bool
+	depends on DMAR
+	default y
+	help
+	  Floppy disk drivers are known to bypass DMA API calls,
+	  thereby failing to work when IOMMU is enabled. This
+	  workaround will setup a 1:1 mapping for the first
+	  16M to make floppy (an ISA device) work.
+
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c41d0728efe..7868707c7ed 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -137,7 +137,7 @@ static ssize_t show_mem_state(struct sys_device *dev, char *buf)
 	return len;
 }
 
-static inline int memory_notify(unsigned long val, void *v)
+int memory_notify(unsigned long val, void *v)
 {
 	return blocking_notifier_call_chain(&memory_chain, val, v);
 }
@@ -183,7 +183,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
 			break;
 		case MEM_OFFLINE:
 			mem->state = MEM_GOING_OFFLINE;
-			memory_notify(MEM_GOING_OFFLINE, NULL);
 			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
 			ret = remove_memory(start_paddr,
 					    PAGES_PER_SECTION << PAGE_SHIFT);
@@ -191,7 +190,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
 				mem->state = old_state;
 				break;
 			}
-			memory_notify(MEM_MAPPING_INVALID, NULL);
 			break;
 		default:
 			printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
@@ -199,11 +197,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
 			WARN_ON(1);
 			ret = -EINVAL;
 	}
-	/*
-	 * For now, only notify on successful memory operations
-	 */
-	if (!ret)
-		memory_notify(action, NULL);
 
 	return ret;
 }
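
With these calls removed (and memory_notify() now exported), the
notifications presumably move into the online_pages()/offline_pages() paths
of mm/memory_hotplug.c (touched by this series but not shown here), where
the pfn range and node are known and a struct memory_notify payload can be
filled in. A plausible shape of the new call site (an assumption based on
the notifier API above, not code from this excerpt):

	struct memory_notify arg = {
		.start_pfn		= pfn,
		.nr_pages		= nr_pages,
		.status_change_nid	= nid,
	};

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	if (ret == NOTIFY_BAD) {
		/* back out and fail the online */
	}
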
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 006054a4099..55505565073 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_PCI_MSI) += msi.o
 # Build the Hypertransport interrupt support
 obj-$(CONFIG_HT_IRQ) += htirq.o
 
+# Build Intel IOMMU support
+obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+
 #
 # Some architectures use the generic PCI setup functions
 #
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
new file mode 100644
index 00000000000..5dfdfdac92e
--- /dev/null
+++ b/drivers/pci/dmar.c
@@ -0,0 +1,329 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20 *
21 * This file implements early detection/parsing of DMA Remapping Devices
22 * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
23 * tables.
24 */
25
26#include <linux/pci.h>
27#include <linux/dmar.h>
28
29#undef PREFIX
30#define PREFIX "DMAR:"
31
32/* No locks are needed as DMA remapping hardware unit
33 * list is constructed at boot time and hotplug of
34 * these units are not supported by the architecture.
35 */
36LIST_HEAD(dmar_drhd_units);
37LIST_HEAD(dmar_rmrr_units);
38
39static struct acpi_table_header * __initdata dmar_tbl;
40
41static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
42{
43 /*
44 * add INCLUDE_ALL at the tail, so scan the list will find it at
45 * the very end.
46 */
47 if (drhd->include_all)
48 list_add_tail(&drhd->list, &dmar_drhd_units);
49 else
50 list_add(&drhd->list, &dmar_drhd_units);
51}
52
53static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
54{
55 list_add(&rmrr->list, &dmar_rmrr_units);
56}
57
58static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
59 struct pci_dev **dev, u16 segment)
60{
61 struct pci_bus *bus;
62 struct pci_dev *pdev = NULL;
63 struct acpi_dmar_pci_path *path;
64 int count;
65
66 bus = pci_find_bus(segment, scope->bus);
67 path = (struct acpi_dmar_pci_path *)(scope + 1);
68 count = (scope->length - sizeof(struct acpi_dmar_device_scope))
69 / sizeof(struct acpi_dmar_pci_path);
70
71 while (count) {
72 if (pdev)
73 pci_dev_put(pdev);
74 /*
75 * Some BIOSes list non-exist devices in DMAR table, just
76 * ignore it
77 */
78 if (!bus) {
79 printk(KERN_WARNING
80 PREFIX "Device scope bus [%d] not found\n",
81 scope->bus);
82 break;
83 }
84 pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn));
85 if (!pdev) {
86 printk(KERN_WARNING PREFIX
87 "Device scope device [%04x:%02x:%02x.%02x] not found\n",
88 segment, bus->number, path->dev, path->fn);
89 break;
90 }
91 path ++;
92 count --;
93 bus = pdev->subordinate;
94 }
95 if (!pdev) {
96 printk(KERN_WARNING PREFIX
97 "Device scope device [%04x:%02x:%02x.%02x] not found\n",
98 segment, scope->bus, path->dev, path->fn);
99 *dev = NULL;
100 return 0;
101 }
102 if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \
103 pdev->subordinate) || (scope->entry_type == \
104 ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) {
105 pci_dev_put(pdev);
106 printk(KERN_WARNING PREFIX
107 "Device scope type does not match for %s\n",
108 pci_name(pdev));
109 return -EINVAL;
110 }
111 *dev = pdev;
112 return 0;
113}
114
115static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt,
116 struct pci_dev ***devices, u16 segment)
117{
118 struct acpi_dmar_device_scope *scope;
119 void * tmp = start;
120 int index;
121 int ret;
122
123 *cnt = 0;
124 while (start < end) {
125 scope = start;
126 if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
127 scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
128 (*cnt)++;
129 else
130 printk(KERN_WARNING PREFIX
131 "Unsupported device scope\n");
132 start += scope->length;
133 }
134 if (*cnt == 0)
135 return 0;
136
137 *devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL);
138 if (!*devices)
139 return -ENOMEM;
140
141 start = tmp;
142 index = 0;
143 while (start < end) {
144 scope = start;
145 if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
146 scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) {
147 ret = dmar_parse_one_dev_scope(scope,
148 &(*devices)[index], segment);
149 if (ret) {
150 kfree(*devices);
151 return ret;
152 }
153 index ++;
154 }
155 start += scope->length;
156 }
157
158 return 0;
159}
160
161/**
162 * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
163 * structure which uniquely represent one DMA remapping hardware unit
164 * present in the platform
165 */
166static int __init
167dmar_parse_one_drhd(struct acpi_dmar_header *header)
168{
169 struct acpi_dmar_hardware_unit *drhd;
170 struct dmar_drhd_unit *dmaru;
171 int ret = 0;
172 static int include_all;
173
174 dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
175 if (!dmaru)
176 return -ENOMEM;
177
178 drhd = (struct acpi_dmar_hardware_unit *)header;
179 dmaru->reg_base_addr = drhd->address;
180 dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
181
182 if (!dmaru->include_all)
183 ret = dmar_parse_dev_scope((void *)(drhd + 1),
184 ((void *)drhd) + header->length,
185 &dmaru->devices_cnt, &dmaru->devices,
186 drhd->segment);
187 else {
188 /* Only allow one INCLUDE_ALL */
189 if (include_all) {
190 printk(KERN_WARNING PREFIX "Only one INCLUDE_ALL "
191 "device scope is allowed\n");
192 ret = -EINVAL;
193 }
194 include_all = 1;
195 }
196
197 if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all))
198 kfree(dmaru);
199 else
200 dmar_register_drhd_unit(dmaru);
201 return ret;
202}
203
204static int __init
205dmar_parse_one_rmrr(struct acpi_dmar_header *header)
206{
207 struct acpi_dmar_reserved_memory *rmrr;
208 struct dmar_rmrr_unit *rmrru;
209 int ret = 0;
210
211 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
212 if (!rmrru)
213 return -ENOMEM;
214
215 rmrr = (struct acpi_dmar_reserved_memory *)header;
216 rmrru->base_address = rmrr->base_address;
217 rmrru->end_address = rmrr->end_address;
218 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
219 ((void *)rmrr) + header->length,
220 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
221
222 if (ret || (rmrru->devices_cnt == 0))
223 kfree(rmrru);
224 else
225 dmar_register_rmrr_unit(rmrru);
226 return ret;
227}
228
229static void __init
230dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
231{
232 struct acpi_dmar_hardware_unit *drhd;
233 struct acpi_dmar_reserved_memory *rmrr;
234
235 switch (header->type) {
236 case ACPI_DMAR_TYPE_HARDWARE_UNIT:
237 drhd = (struct acpi_dmar_hardware_unit *)header;
238		printk(KERN_INFO PREFIX
239			"DRHD (flags: 0x%08x) base: 0x%016Lx\n",
240			drhd->flags, drhd->address);
241 break;
242 case ACPI_DMAR_TYPE_RESERVED_MEMORY:
243 rmrr = (struct acpi_dmar_reserved_memory *)header;
244
245		printk(KERN_INFO PREFIX
246 "RMRR base: 0x%016Lx end: 0x%016Lx\n",
247 rmrr->base_address, rmrr->end_address);
248 break;
249 }
250}
251
252/**
253 * parse_dmar_table - parses the DMA reporting table
254 */
255static int __init
256parse_dmar_table(void)
257{
258 struct acpi_table_dmar *dmar;
259 struct acpi_dmar_header *entry_header;
260 int ret = 0;
261
262 dmar = (struct acpi_table_dmar *)dmar_tbl;
263 if (!dmar)
264 return -ENODEV;
265
266 if (!dmar->width) {
267		printk(KERN_WARNING PREFIX "Invalid DMAR: zero host address width\n");
268 return -EINVAL;
269 }
270
271	printk(KERN_INFO PREFIX "Host address width %d\n",
272 dmar->width + 1);
273
274 entry_header = (struct acpi_dmar_header *)(dmar + 1);
275 while (((unsigned long)entry_header) <
276 (((unsigned long)dmar) + dmar_tbl->length)) {
277 dmar_table_print_dmar_entry(entry_header);
278
279 switch (entry_header->type) {
280 case ACPI_DMAR_TYPE_HARDWARE_UNIT:
281 ret = dmar_parse_one_drhd(entry_header);
282 break;
283 case ACPI_DMAR_TYPE_RESERVED_MEMORY:
284 ret = dmar_parse_one_rmrr(entry_header);
285 break;
286 default:
287 printk(KERN_WARNING PREFIX
288 "Unknown DMAR structure type\n");
289 ret = 0; /* for forward compatibility */
290 break;
291 }
292 if (ret)
293 break;
294
295 entry_header = ((void *)entry_header + entry_header->length);
296 }
297 return ret;
298}
299
300
301int __init dmar_table_init(void)
302{
303	int ret;
304
305	ret = parse_dmar_table();
306	if (ret)
307		return ret;
308	if (list_empty(&dmar_drhd_units)) {
309		printk(KERN_INFO PREFIX "No DMAR devices found\n");
310		return -ENODEV;
311	}
312	return 0;
313}
311
312/**
313 * early_dmar_detect - checks to see if the platform supports DMAR devices
314 */
315int __init early_dmar_detect(void)
316{
317 acpi_status status = AE_OK;
318
319	/* if we can find the DMAR table, then there are DMAR devices */
320 status = acpi_get_table(ACPI_SIG_DMAR, 0,
321 (struct acpi_table_header **)&dmar_tbl);
322
323 if (ACPI_SUCCESS(status) && !dmar_tbl) {
324		printk(KERN_WARNING PREFIX "Unable to map DMAR\n");
325 status = AE_NOT_FOUND;
326 }
327
328 return (ACPI_SUCCESS(status) ? 1 : 0);
329}
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
new file mode 100644
index 00000000000..b3d70310af4
--- /dev/null
+++ b/drivers/pci/intel-iommu.c
@@ -0,0 +1,2271 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20 */
21
22#include <linux/init.h>
23#include <linux/bitmap.h>
24#include <linux/slab.h>
25#include <linux/irq.h>
26#include <linux/interrupt.h>
27#include <linux/sysdev.h>
28#include <linux/spinlock.h>
29#include <linux/pci.h>
30#include <linux/dmar.h>
31#include <linux/dma-mapping.h>
32#include <linux/mempool.h>
33#include "iova.h"
34#include "intel-iommu.h"
35#include <asm/proto.h> /* force_iommu is declared here on x86-64 */
36#include <asm/cacheflush.h>
37#include <asm/iommu.h>
38#include "pci.h"
39
40#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
42
43#define IOAPIC_RANGE_START (0xfee00000)
44#define IOAPIC_RANGE_END (0xfeefffff)
45#define IOVA_START_ADDR (0x1000)
46
47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
48
49#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
50
51#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
53static void domain_remove_dev_info(struct dmar_domain *domain);
54
55static int dmar_disabled;
56static int __initdata dmar_map_gfx = 1;
57static int dmar_forcedac;
58
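/*
 * Sentinel stored in pci_dev->dev.archdata.iommu for devices whose DMA
 * is left untranslated; the DMA ops below test for it and fall back to
 * virt_to_bus().
 */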
59#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60static DEFINE_SPINLOCK(device_domain_lock);
61static LIST_HEAD(device_domain_list);
62
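/*
 * Boot options are comma separated: e.g. "intel_iommu=igfx_off,forcedac"
 * keeps the IOMMU enabled but skips the graphics mapping and forces
 * DAC (64-bit) addressing for PCI DMA.
 */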
63static int __init intel_iommu_setup(char *str)
64{
65 if (!str)
66 return -EINVAL;
67 while (*str) {
68 if (!strncmp(str, "off", 3)) {
69 dmar_disabled = 1;
70 printk(KERN_INFO"Intel-IOMMU: disabled\n");
71 } else if (!strncmp(str, "igfx_off", 8)) {
72 dmar_map_gfx = 0;
73 printk(KERN_INFO
74 "Intel-IOMMU: disable GFX device mapping\n");
75 } else if (!strncmp(str, "forcedac", 8)) {
76			printk(KERN_INFO
77 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78 dmar_forcedac = 1;
79 }
80
81 str += strcspn(str, ",");
82 while (*str == ',')
83 str++;
84 }
85 return 0;
86}
87__setup("intel_iommu=", intel_iommu_setup);
88
89static struct kmem_cache *iommu_domain_cache;
90static struct kmem_cache *iommu_devinfo_cache;
91static struct kmem_cache *iommu_iova_cache;
92
93static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94{
95 unsigned int flags;
96 void *vaddr;
97
98 /* trying to avoid low memory issues */
99 flags = current->flags & PF_MEMALLOC;
100 current->flags |= PF_MEMALLOC;
101 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
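	/*
	 * ~PF_MEMALLOC | flags is all ones when PF_MEMALLOC was already
	 * set, so the next line clears the bit only if we set it above.
	 */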
102 current->flags &= (~PF_MEMALLOC | flags);
103 return vaddr;
104}
105
106
107static inline void *alloc_pgtable_page(void)
108{
109 unsigned int flags;
110 void *vaddr;
111
112 /* trying to avoid low memory issues */
113 flags = current->flags & PF_MEMALLOC;
114 current->flags |= PF_MEMALLOC;
115 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116 current->flags &= (~PF_MEMALLOC | flags);
117 return vaddr;
118}
119
120static inline void free_pgtable_page(void *vaddr)
121{
122 free_page((unsigned long)vaddr);
123}
124
125static inline void *alloc_domain_mem(void)
126{
127 return iommu_kmem_cache_alloc(iommu_domain_cache);
128}
129
130static inline void free_domain_mem(void *vaddr)
131{
132 kmem_cache_free(iommu_domain_cache, vaddr);
133}
134
135static inline void *alloc_devinfo_mem(void)
136{
137 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138}
139
140static inline void free_devinfo_mem(void *vaddr)
141{
142 kmem_cache_free(iommu_devinfo_cache, vaddr);
143}
144
145struct iova *alloc_iova_mem(void)
146{
147 return iommu_kmem_cache_alloc(iommu_iova_cache);
148}
149
150void free_iova_mem(struct iova *iova)
151{
152 kmem_cache_free(iommu_iova_cache, iova);
153}
154
155static inline void __iommu_flush_cache(
156 struct intel_iommu *iommu, void *addr, int size)
157{
158 if (!ecap_coherent(iommu->ecap))
159 clflush_cache_range(addr, size);
160}
161
162/* Gets context entry for a given bus and devfn */
163static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
164 u8 bus, u8 devfn)
165{
166 struct root_entry *root;
167 struct context_entry *context;
168 unsigned long phy_addr;
169 unsigned long flags;
170
171 spin_lock_irqsave(&iommu->lock, flags);
172 root = &iommu->root_entry[bus];
173 context = get_context_addr_from_root(root);
174 if (!context) {
175 context = (struct context_entry *)alloc_pgtable_page();
176 if (!context) {
177 spin_unlock_irqrestore(&iommu->lock, flags);
178 return NULL;
179 }
180 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181 phy_addr = virt_to_phys((void *)context);
182 set_root_value(root, phy_addr);
183 set_root_present(root);
184 __iommu_flush_cache(iommu, root, sizeof(*root));
185 }
186 spin_unlock_irqrestore(&iommu->lock, flags);
187 return &context[devfn];
188}
189
190static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191{
192 struct root_entry *root;
193 struct context_entry *context;
194 int ret;
195 unsigned long flags;
196
197 spin_lock_irqsave(&iommu->lock, flags);
198 root = &iommu->root_entry[bus];
199 context = get_context_addr_from_root(root);
200 if (!context) {
201 ret = 0;
202 goto out;
203 }
204 ret = context_present(context[devfn]);
205out:
206 spin_unlock_irqrestore(&iommu->lock, flags);
207 return ret;
208}
209
210static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211{
212 struct root_entry *root;
213 struct context_entry *context;
214 unsigned long flags;
215
216 spin_lock_irqsave(&iommu->lock, flags);
217 root = &iommu->root_entry[bus];
218 context = get_context_addr_from_root(root);
219 if (context) {
220 context_clear_entry(context[devfn]);
221		__iommu_flush_cache(iommu, &context[devfn],
222						sizeof(*context));
223 }
224 spin_unlock_irqrestore(&iommu->lock, flags);
225}
226
227static void free_context_table(struct intel_iommu *iommu)
228{
229 struct root_entry *root;
230 int i;
231 unsigned long flags;
232 struct context_entry *context;
233
234 spin_lock_irqsave(&iommu->lock, flags);
235 if (!iommu->root_entry) {
236 goto out;
237 }
238 for (i = 0; i < ROOT_ENTRY_NR; i++) {
239 root = &iommu->root_entry[i];
240 context = get_context_addr_from_root(root);
241 if (context)
242 free_pgtable_page(context);
243 }
244 free_pgtable_page(iommu->root_entry);
245 iommu->root_entry = NULL;
246out:
247 spin_unlock_irqrestore(&iommu->lock, flags);
248}
249
250/* page table handling */
251#define LEVEL_STRIDE (9)
252#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
253
254static inline int agaw_to_level(int agaw)
255{
256 return agaw + 2;
257}
258
259static inline int agaw_to_width(int agaw)
260{
261	return 30 + agaw * LEVEL_STRIDE;
262}
264
265static inline int width_to_agaw(int width)
266{
267 return (width - 30) / LEVEL_STRIDE;
268}
269
270static inline unsigned int level_to_offset_bits(int level)
271{
272 return (12 + (level - 1) * LEVEL_STRIDE);
273}
274
275static inline int address_level_offset(u64 addr, int level)
276{
277 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278}
279
280static inline u64 level_mask(int level)
281{
282 return ((u64)-1 << level_to_offset_bits(level));
283}
284
285static inline u64 level_size(int level)
286{
287 return ((u64)1 << level_to_offset_bits(level));
288}
289
290static inline u64 align_to_level(u64 addr, int level)
291{
292 return ((addr + level_size(level) - 1) & level_mask(level));
293}
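
/*
 * Worked example: agaw 2 gives a 4-level table (agaw_to_level) spanning
 * a 30 + 2 * 9 = 48 bit address space (agaw_to_width); level 1 indexes
 * address bits 12-20, level 2 bits 21-29, and so on.
 */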
294
295static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
296{
297 int addr_width = agaw_to_width(domain->agaw);
298 struct dma_pte *parent, *pte = NULL;
299 int level = agaw_to_level(domain->agaw);
300 int offset;
301 unsigned long flags;
302
303 BUG_ON(!domain->pgd);
304
305 addr &= (((u64)1) << addr_width) - 1;
306 parent = domain->pgd;
307
308 spin_lock_irqsave(&domain->mapping_lock, flags);
309 while (level > 0) {
310 void *tmp_page;
311
312 offset = address_level_offset(addr, level);
313 pte = &parent[offset];
314 if (level == 1)
315 break;
316
317 if (!dma_pte_present(*pte)) {
318 tmp_page = alloc_pgtable_page();
319
320 if (!tmp_page) {
321 spin_unlock_irqrestore(&domain->mapping_lock,
322 flags);
323 return NULL;
324 }
325 __iommu_flush_cache(domain->iommu, tmp_page,
326 PAGE_SIZE_4K);
327 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
328 /*
329			 * non-leaf entries always allow read/write; the last
330			 * level PTE controls the actual access permission
331 */
332 dma_set_pte_readable(*pte);
333 dma_set_pte_writable(*pte);
334 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
335 }
336 parent = phys_to_virt(dma_pte_addr(*pte));
337 level--;
338 }
339
340 spin_unlock_irqrestore(&domain->mapping_lock, flags);
341 return pte;
342}
343
344/* return address's pte at specific level */
345static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
346 int level)
347{
348 struct dma_pte *parent, *pte = NULL;
349 int total = agaw_to_level(domain->agaw);
350 int offset;
351
352 parent = domain->pgd;
353 while (level <= total) {
354 offset = address_level_offset(addr, total);
355 pte = &parent[offset];
356 if (level == total)
357 return pte;
358
359 if (!dma_pte_present(*pte))
360 break;
361 parent = phys_to_virt(dma_pte_addr(*pte));
362 total--;
363 }
364 return NULL;
365}
366
367/* clear one page's page table */
368static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369{
370 struct dma_pte *pte = NULL;
371
372 /* get last level pte */
373 pte = dma_addr_level_pte(domain, addr, 1);
374
375 if (pte) {
376 dma_clear_pte(*pte);
377 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378 }
379}
380
381/* clear the last level pte; a tlb flush should follow */
382static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383{
384 int addr_width = agaw_to_width(domain->agaw);
385
386 start &= (((u64)1) << addr_width) - 1;
387 end &= (((u64)1) << addr_width) - 1;
388	/* round partial pages: start up, end down */
389 start = PAGE_ALIGN_4K(start);
390 end &= PAGE_MASK_4K;
391
392 /* we don't need lock here, nobody else touches the iova range */
393 while (start < end) {
394 dma_pte_clear_one(domain, start);
395 start += PAGE_SIZE_4K;
396 }
397}
398
399/* free page table pages. last level pte should already be cleared */
400static void dma_pte_free_pagetable(struct dmar_domain *domain,
401 u64 start, u64 end)
402{
403 int addr_width = agaw_to_width(domain->agaw);
404 struct dma_pte *pte;
405 int total = agaw_to_level(domain->agaw);
406 int level;
407 u64 tmp;
408
409 start &= (((u64)1) << addr_width) - 1;
410 end &= (((u64)1) << addr_width) - 1;
411
412 /* we don't need lock here, nobody else touches the iova range */
413 level = 2;
414 while (level <= total) {
415 tmp = align_to_level(start, level);
416 if (tmp >= end || (tmp + level_size(level) > end))
417 return;
418
419 while (tmp < end) {
420 pte = dma_addr_level_pte(domain, tmp, level);
421 if (pte) {
422 free_pgtable_page(
423 phys_to_virt(dma_pte_addr(*pte)));
424 dma_clear_pte(*pte);
425 __iommu_flush_cache(domain->iommu,
426 pte, sizeof(*pte));
427 }
428 tmp += level_size(level);
429 }
430 level++;
431 }
432 /* free pgd */
433 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434 free_pgtable_page(domain->pgd);
435 domain->pgd = NULL;
436 }
437}
438
439/* iommu handling */
440static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441{
442 struct root_entry *root;
443 unsigned long flags;
444
445 root = (struct root_entry *)alloc_pgtable_page();
446 if (!root)
447 return -ENOMEM;
448
449 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451 spin_lock_irqsave(&iommu->lock, flags);
452 iommu->root_entry = root;
453 spin_unlock_irqrestore(&iommu->lock, flags);
454
455 return 0;
456}
457
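/*
 * Poll "op(iommu->reg + offset)" into sts until cond holds; panic if the
 * hardware has not responded within DMAR_OPERATION_TIMEOUT (one minute).
 * sts must be an lvalue in the caller.
 */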
458#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
459{\
460 unsigned long start_time = jiffies;\
461 while (1) {\
462 sts = op (iommu->reg + offset);\
463 if (cond)\
464 break;\
465 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466 panic("DMAR hardware is malfunctioning\n");\
467 cpu_relax();\
468 }\
469}
470
471static void iommu_set_root_entry(struct intel_iommu *iommu)
472{
473 void *addr;
474 u32 cmd, sts;
475 unsigned long flag;
476
477 addr = iommu->root_entry;
478
479 spin_lock_irqsave(&iommu->register_lock, flag);
480 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
481
482 cmd = iommu->gcmd | DMA_GCMD_SRTP;
483 writel(cmd, iommu->reg + DMAR_GCMD_REG);
484
485	/* Make sure hardware completes it */
486 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487 readl, (sts & DMA_GSTS_RTPS), sts);
488
489 spin_unlock_irqrestore(&iommu->register_lock, flag);
490}
491
492static void iommu_flush_write_buffer(struct intel_iommu *iommu)
493{
494 u32 val;
495 unsigned long flag;
496
497 if (!cap_rwbf(iommu->cap))
498 return;
499 val = iommu->gcmd | DMA_GCMD_WBF;
500
501 spin_lock_irqsave(&iommu->register_lock, flag);
502 writel(val, iommu->reg + DMAR_GCMD_REG);
503
504	/* Make sure hardware completes it */
505 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506 readl, (!(val & DMA_GSTS_WBFS)), val);
507
508 spin_unlock_irqrestore(&iommu->register_lock, flag);
509}
510
511/* the return value determines whether we need a write buffer flush */
512static int __iommu_flush_context(struct intel_iommu *iommu,
513 u16 did, u16 source_id, u8 function_mask, u64 type,
514 int non_present_entry_flush)
515{
516 u64 val = 0;
517 unsigned long flag;
518
519 /*
520	 * In the non-present entry flush case: if the hardware does not
521	 * cache non-present entries we do nothing; if it does, we flush
522	 * the entries of domain 0 (the domain id used to tag any cached
523	 * non-present entries)
524 */
525 if (non_present_entry_flush) {
526 if (!cap_caching_mode(iommu->cap))
527 return 1;
528 else
529 did = 0;
530 }
531
532 switch (type) {
533 case DMA_CCMD_GLOBAL_INVL:
534 val = DMA_CCMD_GLOBAL_INVL;
535 break;
536 case DMA_CCMD_DOMAIN_INVL:
537 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
538 break;
539 case DMA_CCMD_DEVICE_INVL:
540 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
542 break;
543 default:
544 BUG();
545 }
546 val |= DMA_CCMD_ICC;
547
548 spin_lock_irqsave(&iommu->register_lock, flag);
549 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
550
551	/* Make sure hardware completes it */
552 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
554
555 spin_unlock_irqrestore(&iommu->register_lock, flag);
556
557	/* flushing the context entry implicitly flushes the write buffer */
558 return 0;
559}
560
561static inline int iommu_flush_context_global(struct intel_iommu *iommu,
562 int non_present_entry_flush)
563{
564 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565 non_present_entry_flush);
566}
567
568static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569 int non_present_entry_flush)
570{
571 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572 non_present_entry_flush);
573}
574
575static inline int iommu_flush_context_device(struct intel_iommu *iommu,
576 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577{
578 return __iommu_flush_context(iommu, did, source_id, function_mask,
579 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580}
581
582/* the return value determines whether we need a write buffer flush */
583static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584 u64 addr, unsigned int size_order, u64 type,
585 int non_present_entry_flush)
586{
587 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588 u64 val = 0, val_iva = 0;
589 unsigned long flag;
590
591 /*
592	 * In the non-present entry flush case: if the hardware does not
593	 * cache non-present entries we do nothing; if it does, we flush
594	 * the entries of domain 0 (the domain id used to tag any cached
595	 * non-present entries)
596 */
597 if (non_present_entry_flush) {
598 if (!cap_caching_mode(iommu->cap))
599 return 1;
600 else
601 did = 0;
602 }
603
604 switch (type) {
605 case DMA_TLB_GLOBAL_FLUSH:
606 /* global flush doesn't need set IVA_REG */
607 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
608 break;
609 case DMA_TLB_DSI_FLUSH:
610 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
611 break;
612 case DMA_TLB_PSI_FLUSH:
613 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614 /* Note: always flush non-leaf currently */
615 val_iva = size_order | addr;
616 break;
617 default:
618 BUG();
619 }
620 /* Note: set drain read/write */
621#if 0
622 /*
623	 * Read drain is probably only needed to be extra safe;
624	 * it looks like we can omit it without any impact.
625 */
626 if (cap_read_drain(iommu->cap))
627 val |= DMA_TLB_READ_DRAIN;
628#endif
629 if (cap_write_drain(iommu->cap))
630 val |= DMA_TLB_WRITE_DRAIN;
631
632 spin_lock_irqsave(&iommu->register_lock, flag);
633 /* Note: Only uses first TLB reg currently */
634 if (val_iva)
635 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636 dmar_writeq(iommu->reg + tlb_offset + 8, val);
637
638	/* Make sure hardware completes it */
639 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640 dmar_readq, (!(val & DMA_TLB_IVT)), val);
641
642 spin_unlock_irqrestore(&iommu->register_lock, flag);
643
644 /* check IOTLB invalidation granularity */
645 if (DMA_TLB_IAIG(val) == 0)
646 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650	/* flushing the IOTLB implicitly flushes the write buffer */
651 return 0;
652}
653
654static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
655 int non_present_entry_flush)
656{
657 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658 non_present_entry_flush);
659}
660
661static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662 int non_present_entry_flush)
663{
664 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665 non_present_entry_flush);
666}
667
668static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
669 u64 addr, unsigned int pages, int non_present_entry_flush)
670{
671 unsigned int mask;
672
673 BUG_ON(addr & (~PAGE_MASK_4K));
674 BUG_ON(pages == 0);
675
676 /* Fallback to domain selective flush if no PSI support */
677 if (!cap_pgsel_inv(iommu->cap))
678 return iommu_flush_iotlb_dsi(iommu, did,
679 non_present_entry_flush);
680
681 /*
682 * PSI requires page size to be 2 ^ x, and the base address is naturally
683 * aligned to the size
684 */
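	/*
	 * e.g. pages = 3 rounds up to 4, giving mask = 2: the hardware
	 * invalidates 2^2 pages starting at the 16KB-aligned addr.
	 */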
685 mask = ilog2(__roundup_pow_of_two(pages));
686 /* Fallback to domain selective flush if size is too big */
687 if (mask > cap_max_amask_val(iommu->cap))
688 return iommu_flush_iotlb_dsi(iommu, did,
689 non_present_entry_flush);
690
691 return __iommu_flush_iotlb(iommu, did, addr, mask,
692 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
693}
694
695static int iommu_enable_translation(struct intel_iommu *iommu)
696{
697 u32 sts;
698 unsigned long flags;
699
700 spin_lock_irqsave(&iommu->register_lock, flags);
701 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
702
704	/* Make sure hardware completes it */
704 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
705 readl, (sts & DMA_GSTS_TES), sts);
706
707 iommu->gcmd |= DMA_GCMD_TE;
708 spin_unlock_irqrestore(&iommu->register_lock, flags);
709 return 0;
710}
711
712static int iommu_disable_translation(struct intel_iommu *iommu)
713{
714 u32 sts;
715 unsigned long flag;
716
717 spin_lock_irqsave(&iommu->register_lock, flag);
718 iommu->gcmd &= ~DMA_GCMD_TE;
719 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
720
722	/* Make sure hardware completes it */
722 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723 readl, (!(sts & DMA_GSTS_TES)), sts);
724
725 spin_unlock_irqrestore(&iommu->register_lock, flag);
726 return 0;
727}
728
729/* iommu interrupt handling. Most of it is MSI-like. */
730
731static char *fault_reason_strings[] =
732{
733 "Software",
734 "Present bit in root entry is clear",
735 "Present bit in context entry is clear",
736 "Invalid context entry",
737 "Access beyond MGAW",
738 "PTE Write access is not set",
739 "PTE Read access is not set",
740 "Next page table ptr is invalid",
741 "Root table address invalid",
742 "Context table ptr is invalid",
743 "non-zero reserved fields in RTP",
744 "non-zero reserved fields in CTP",
745 "non-zero reserved fields in PTE",
746 "Unknown"
747};
748#define MAX_FAULT_REASON_IDX ARRAY_SIZE(fault_reason_strings)
749
750char *dmar_get_fault_reason(u8 fault_reason)
751{
752	if (fault_reason >= MAX_FAULT_REASON_IDX)
753		return fault_reason_strings[MAX_FAULT_REASON_IDX - 1];
754 else
755 return fault_reason_strings[fault_reason];
756}
757
758void dmar_msi_unmask(unsigned int irq)
759{
760 struct intel_iommu *iommu = get_irq_data(irq);
761 unsigned long flag;
762
763 /* unmask it */
764 spin_lock_irqsave(&iommu->register_lock, flag);
765 writel(0, iommu->reg + DMAR_FECTL_REG);
766 /* Read a reg to force flush the post write */
767 readl(iommu->reg + DMAR_FECTL_REG);
768 spin_unlock_irqrestore(&iommu->register_lock, flag);
769}
770
771void dmar_msi_mask(unsigned int irq)
772{
773 unsigned long flag;
774 struct intel_iommu *iommu = get_irq_data(irq);
775
776 /* mask it */
777 spin_lock_irqsave(&iommu->register_lock, flag);
778 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
779 /* Read a reg to force flush the post write */
780 readl(iommu->reg + DMAR_FECTL_REG);
781 spin_unlock_irqrestore(&iommu->register_lock, flag);
782}
783
784void dmar_msi_write(int irq, struct msi_msg *msg)
785{
786 struct intel_iommu *iommu = get_irq_data(irq);
787 unsigned long flag;
788
789 spin_lock_irqsave(&iommu->register_lock, flag);
790 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
791 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
792 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
793 spin_unlock_irqrestore(&iommu->register_lock, flag);
794}
795
796void dmar_msi_read(int irq, struct msi_msg *msg)
797{
798 struct intel_iommu *iommu = get_irq_data(irq);
799 unsigned long flag;
800
801 spin_lock_irqsave(&iommu->register_lock, flag);
802 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
803 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
804 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
805 spin_unlock_irqrestore(&iommu->register_lock, flag);
806}
807
808static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
809 u8 fault_reason, u16 source_id, u64 addr)
810{
811 char *reason;
812
813 reason = dmar_get_fault_reason(fault_reason);
814
815 printk(KERN_ERR
816 "DMAR:[%s] Request device [%02x:%02x.%d] "
817 "fault addr %llx \n"
818 "DMAR:[fault reason %02d] %s\n",
819 (type ? "DMA Read" : "DMA Write"),
820 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
821 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
822 return 0;
823}
824
825#define PRIMARY_FAULT_REG_LEN (16)
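/*
 * Each fault recording register is 16 bytes: the low quadword holds the
 * faulting page address, the dword at offset 8 the source-id, and the
 * dword at offset 12 the fault reason, request type and F (fault-logged)
 * bit -- matching the readl/dmar_readq offsets used below.
 */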
826static irqreturn_t iommu_page_fault(int irq, void *dev_id)
827{
828 struct intel_iommu *iommu = dev_id;
829 int reg, fault_index;
830 u32 fault_status;
831 unsigned long flag;
832
833 spin_lock_irqsave(&iommu->register_lock, flag);
834 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
835
836 /* TBD: ignore advanced fault log currently */
837 if (!(fault_status & DMA_FSTS_PPF))
838 goto clear_overflow;
839
840 fault_index = dma_fsts_fault_record_index(fault_status);
841 reg = cap_fault_reg_offset(iommu->cap);
842 while (1) {
843 u8 fault_reason;
844 u16 source_id;
845 u64 guest_addr;
846 int type;
847 u32 data;
848
849 /* highest 32 bits */
850 data = readl(iommu->reg + reg +
851 fault_index * PRIMARY_FAULT_REG_LEN + 12);
852 if (!(data & DMA_FRCD_F))
853 break;
854
855 fault_reason = dma_frcd_fault_reason(data);
856 type = dma_frcd_type(data);
857
858 data = readl(iommu->reg + reg +
859 fault_index * PRIMARY_FAULT_REG_LEN + 8);
860 source_id = dma_frcd_source_id(data);
861
862 guest_addr = dmar_readq(iommu->reg + reg +
863 fault_index * PRIMARY_FAULT_REG_LEN);
864 guest_addr = dma_frcd_page_addr(guest_addr);
865 /* clear the fault */
866 writel(DMA_FRCD_F, iommu->reg + reg +
867 fault_index * PRIMARY_FAULT_REG_LEN + 12);
868
869 spin_unlock_irqrestore(&iommu->register_lock, flag);
870
871 iommu_page_fault_do_one(iommu, type, fault_reason,
872 source_id, guest_addr);
873
874 fault_index++;
875		if (fault_index >= cap_num_fault_regs(iommu->cap))
876 fault_index = 0;
877 spin_lock_irqsave(&iommu->register_lock, flag);
878 }
879clear_overflow:
880 /* clear primary fault overflow */
881 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882 if (fault_status & DMA_FSTS_PFO)
883 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
884
885 spin_unlock_irqrestore(&iommu->register_lock, flag);
886 return IRQ_HANDLED;
887}
888
889int dmar_set_interrupt(struct intel_iommu *iommu)
890{
891 int irq, ret;
892
893 irq = create_irq();
894 if (!irq) {
895 printk(KERN_ERR "IOMMU: no free vectors\n");
896 return -EINVAL;
897 }
898
899 set_irq_data(irq, iommu);
900 iommu->irq = irq;
901
902 ret = arch_setup_dmar_msi(irq);
903 if (ret) {
904 set_irq_data(irq, NULL);
905 iommu->irq = 0;
906 destroy_irq(irq);
907		return ret;
908 }
909
910	/* make sure any pending fault records are cleared */
911 iommu_page_fault(irq, iommu);
912
913 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
914 if (ret)
915 printk(KERN_ERR "IOMMU: can't request irq\n");
916 return ret;
917}
918
919static int iommu_init_domains(struct intel_iommu *iommu)
920{
921 unsigned long ndomains;
922 unsigned long nlongs;
923
924 ndomains = cap_ndoms(iommu->cap);
925	pr_debug("Number of Domains supported <%ld>\n", ndomains);
926 nlongs = BITS_TO_LONGS(ndomains);
927
928	/* TBD: there may be up to 64K domains;
929	 * consider a different allocation scheme for future chips
930 */
931 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
932 if (!iommu->domain_ids) {
933 printk(KERN_ERR "Allocating domain id array failed\n");
934 return -ENOMEM;
935 }
936 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
937 GFP_KERNEL);
938 if (!iommu->domains) {
939 printk(KERN_ERR "Allocating domain array failed\n");
940 kfree(iommu->domain_ids);
941 return -ENOMEM;
942 }
943
944 /*
945 * if Caching mode is set, then invalid translations are tagged
946	 * with domain id 0. Hence we need to pre-allocate it.
947 */
948 if (cap_caching_mode(iommu->cap))
949 set_bit(0, iommu->domain_ids);
950 return 0;
951}
952
953static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
954{
955 struct intel_iommu *iommu;
956 int ret;
957 int map_size;
958 u32 ver;
959
960 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
961 if (!iommu)
962 return NULL;
963 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
964 if (!iommu->reg) {
965 printk(KERN_ERR "IOMMU: can't map the region\n");
966 goto error;
967 }
968 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
969 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
970
971 /* the registers might be more than one page */
972 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
973 cap_max_fault_reg_offset(iommu->cap));
974 map_size = PAGE_ALIGN_4K(map_size);
975 if (map_size > PAGE_SIZE_4K) {
976 iounmap(iommu->reg);
977 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
978 if (!iommu->reg) {
979 printk(KERN_ERR "IOMMU: can't map the region\n");
980 goto error;
981 }
982 }
983
984 ver = readl(iommu->reg + DMAR_VER_REG);
985 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
986 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
987 iommu->cap, iommu->ecap);
988 ret = iommu_init_domains(iommu);
989 if (ret)
990 goto error_unmap;
991 spin_lock_init(&iommu->lock);
992 spin_lock_init(&iommu->register_lock);
993
994 drhd->iommu = iommu;
995 return iommu;
996error_unmap:
997 iounmap(iommu->reg);
998	iommu->reg = NULL;
999error:
1000 kfree(iommu);
1001 return NULL;
1002}
1003
1004static void domain_exit(struct dmar_domain *domain);
1005static void free_iommu(struct intel_iommu *iommu)
1006{
1007 struct dmar_domain *domain;
1008 int i;
1009
1010 if (!iommu)
1011 return;
1012
1013 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1014 for (; i < cap_ndoms(iommu->cap); ) {
1015 domain = iommu->domains[i];
1016 clear_bit(i, iommu->domain_ids);
1017 domain_exit(domain);
1018 i = find_next_bit(iommu->domain_ids,
1019 cap_ndoms(iommu->cap), i+1);
1020 }
1021
1022 if (iommu->gcmd & DMA_GCMD_TE)
1023 iommu_disable_translation(iommu);
1024
1025 if (iommu->irq) {
1026 set_irq_data(iommu->irq, NULL);
1027 /* This will mask the irq */
1028 free_irq(iommu->irq, iommu);
1029 destroy_irq(iommu->irq);
1030 }
1031
1032 kfree(iommu->domains);
1033 kfree(iommu->domain_ids);
1034
1035 /* free context mapping */
1036 free_context_table(iommu);
1037
1038 if (iommu->reg)
1039 iounmap(iommu->reg);
1040 kfree(iommu);
1041}
1042
1043static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1044{
1045 unsigned long num;
1046 unsigned long ndomains;
1047 struct dmar_domain *domain;
1048 unsigned long flags;
1049
1050 domain = alloc_domain_mem();
1051 if (!domain)
1052 return NULL;
1053
1054 ndomains = cap_ndoms(iommu->cap);
1055
1056 spin_lock_irqsave(&iommu->lock, flags);
1057 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1058 if (num >= ndomains) {
1059 spin_unlock_irqrestore(&iommu->lock, flags);
1060 free_domain_mem(domain);
1061 printk(KERN_ERR "IOMMU: no free domain ids\n");
1062 return NULL;
1063 }
1064
1065 set_bit(num, iommu->domain_ids);
1066 domain->id = num;
1067 domain->iommu = iommu;
1068 iommu->domains[num] = domain;
1069 spin_unlock_irqrestore(&iommu->lock, flags);
1070
1071 return domain;
1072}
1073
1074static void iommu_free_domain(struct dmar_domain *domain)
1075{
1076 unsigned long flags;
1077
1078 spin_lock_irqsave(&domain->iommu->lock, flags);
1079 clear_bit(domain->id, domain->iommu->domain_ids);
1080 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1081}
1082
1083static struct iova_domain reserved_iova_list;
1084
1085static void dmar_init_reserved_ranges(void)
1086{
1087 struct pci_dev *pdev = NULL;
1088 struct iova *iova;
1089 int i;
1090 u64 addr, size;
1091
1092 init_iova_domain(&reserved_iova_list);
1093
1094 /* IOAPIC ranges shouldn't be accessed by DMA */
1095 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096 IOVA_PFN(IOAPIC_RANGE_END));
1097 if (!iova)
1098 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1099
1100 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101 for_each_pci_dev(pdev) {
1102 struct resource *r;
1103
1104 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105 r = &pdev->resource[i];
1106 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1107 continue;
1108 addr = r->start;
1109 addr &= PAGE_MASK_4K;
1110 size = r->end - addr;
1111 size = PAGE_ALIGN_4K(size);
1112 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113 IOVA_PFN(size + addr) - 1);
1114 if (!iova)
1115 printk(KERN_ERR "Reserve iova failed\n");
1116 }
1117 }
1118
1119}
1120
1121static void domain_reserve_special_ranges(struct dmar_domain *domain)
1122{
1123 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1124}
1125
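/*
 * Round the guest address width up to 12 + 9 * n: a page offset plus a
 * whole number of 9-bit table levels. For example, gaw 40 gives r = 1
 * and is widened to an adjusted width of 48.
 */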
1126static inline int guestwidth_to_adjustwidth(int gaw)
1127{
1128 int agaw;
1129 int r = (gaw - 12) % 9;
1130
1131 if (r == 0)
1132 agaw = gaw;
1133 else
1134 agaw = gaw + 9 - r;
1135 if (agaw > 64)
1136 agaw = 64;
1137 return agaw;
1138}
1139
1140static int domain_init(struct dmar_domain *domain, int guest_width)
1141{
1142 struct intel_iommu *iommu;
1143 int adjust_width, agaw;
1144 unsigned long sagaw;
1145
1146 init_iova_domain(&domain->iovad);
1147 spin_lock_init(&domain->mapping_lock);
1148
1149 domain_reserve_special_ranges(domain);
1150
1151 /* calculate AGAW */
1152 iommu = domain->iommu;
1153 if (guest_width > cap_mgaw(iommu->cap))
1154 guest_width = cap_mgaw(iommu->cap);
1155 domain->gaw = guest_width;
1156 adjust_width = guestwidth_to_adjustwidth(guest_width);
1157 agaw = width_to_agaw(adjust_width);
1158 sagaw = cap_sagaw(iommu->cap);
1159 if (!test_bit(agaw, &sagaw)) {
1160 /* hardware doesn't support it, choose a bigger one */
1161 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162 agaw = find_next_bit(&sagaw, 5, agaw);
1163 if (agaw >= 5)
1164 return -ENODEV;
1165 }
1166 domain->agaw = agaw;
1167 INIT_LIST_HEAD(&domain->devices);
1168
1169 /* always allocate the top pgd */
1170 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1171 if (!domain->pgd)
1172 return -ENOMEM;
1173 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1174 return 0;
1175}
1176
1177static void domain_exit(struct dmar_domain *domain)
1178{
1179 u64 end;
1180
1181	/* Domain 0 is reserved, so don't process it */
1182 if (!domain)
1183 return;
1184
1185 domain_remove_dev_info(domain);
1186 /* destroy iovas */
1187 put_iova_domain(&domain->iovad);
1188 end = DOMAIN_MAX_ADDR(domain->gaw);
1189	end = end & PAGE_MASK_4K;	/* round down to a page boundary */
1190
1191 /* clear ptes */
1192 dma_pte_clear_range(domain, 0, end);
1193
1194 /* free page tables */
1195 dma_pte_free_pagetable(domain, 0, end);
1196
1197 iommu_free_domain(domain);
1198 free_domain_mem(domain);
1199}
1200
1201static int domain_context_mapping_one(struct dmar_domain *domain,
1202 u8 bus, u8 devfn)
1203{
1204 struct context_entry *context;
1205 struct intel_iommu *iommu = domain->iommu;
1206 unsigned long flags;
1207
1208 pr_debug("Set context mapping for %02x:%02x.%d\n",
1209 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210 BUG_ON(!domain->pgd);
1211 context = device_to_context_entry(iommu, bus, devfn);
1212 if (!context)
1213 return -ENOMEM;
1214 spin_lock_irqsave(&iommu->lock, flags);
1215 if (context_present(*context)) {
1216 spin_unlock_irqrestore(&iommu->lock, flags);
1217 return 0;
1218 }
1219
1220 context_set_domain_id(*context, domain->id);
1221 context_set_address_width(*context, domain->agaw);
1222 context_set_address_root(*context, virt_to_phys(domain->pgd));
1223 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224 context_set_fault_enable(*context);
1225 context_set_present(*context);
1226 __iommu_flush_cache(iommu, context, sizeof(*context));
1227
1228 /* it's a non-present to present mapping */
1229 if (iommu_flush_context_device(iommu, domain->id,
1230 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231 iommu_flush_write_buffer(iommu);
1232 else
1233 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234 spin_unlock_irqrestore(&iommu->lock, flags);
1235 return 0;
1236}
1237
1238static int
1239domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1240{
1241 int ret;
1242 struct pci_dev *tmp, *parent;
1243
1244 ret = domain_context_mapping_one(domain, pdev->bus->number,
1245 pdev->devfn);
1246 if (ret)
1247 return ret;
1248
1249 /* dependent device mapping */
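	/*
	 * DMA from a device behind a PCIe-to-PCI bridge reaches the IOMMU
	 * with the bridge's (or its secondary bus's) requester id, so the
	 * bridge and every bridge above it must map to the same domain.
	 */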
1250 tmp = pci_find_upstream_pcie_bridge(pdev);
1251 if (!tmp)
1252 return 0;
1253 /* Secondary interface's bus number and devfn 0 */
1254 parent = pdev->bus->self;
1255 while (parent != tmp) {
1256 ret = domain_context_mapping_one(domain, parent->bus->number,
1257 parent->devfn);
1258 if (ret)
1259 return ret;
1260 parent = parent->bus->self;
1261 }
1262 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263 return domain_context_mapping_one(domain,
1264 tmp->subordinate->number, 0);
1265 else /* this is a legacy PCI bridge */
1266 return domain_context_mapping_one(domain,
1267 tmp->bus->number, tmp->devfn);
1268}
1269
1270static int domain_context_mapped(struct dmar_domain *domain,
1271 struct pci_dev *pdev)
1272{
1273 int ret;
1274 struct pci_dev *tmp, *parent;
1275
1276 ret = device_context_mapped(domain->iommu,
1277 pdev->bus->number, pdev->devfn);
1278 if (!ret)
1279 return ret;
1280 /* dependent device mapping */
1281 tmp = pci_find_upstream_pcie_bridge(pdev);
1282 if (!tmp)
1283 return ret;
1284 /* Secondary interface's bus number and devfn 0 */
1285 parent = pdev->bus->self;
1286 while (parent != tmp) {
1287 ret = device_context_mapped(domain->iommu, parent->bus->number,
1288 parent->devfn);
1289 if (!ret)
1290 return ret;
1291 parent = parent->bus->self;
1292 }
1293 if (tmp->is_pcie)
1294 return device_context_mapped(domain->iommu,
1295 tmp->subordinate->number, 0);
1296 else
1297 return device_context_mapped(domain->iommu,
1298 tmp->bus->number, tmp->devfn);
1299}
1300
1301static int
1302domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303 u64 hpa, size_t size, int prot)
1304{
1305 u64 start_pfn, end_pfn;
1306 struct dma_pte *pte;
1307 int index;
1308
1309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1310 return -EINVAL;
1311 iova &= PAGE_MASK_4K;
1312 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1314 index = 0;
1315 while (start_pfn < end_pfn) {
1316 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1317 if (!pte)
1318 return -ENOMEM;
1319 /* We don't need lock here, nobody else
1320 * touches the iova range
1321 */
1322 BUG_ON(dma_pte_addr(*pte));
1323 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324 dma_set_pte_prot(*pte, prot);
1325 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1326 start_pfn++;
1327 index++;
1328 }
1329 return 0;
1330}
1331
1332static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1333{
1334 clear_context_table(domain->iommu, bus, devfn);
1335 iommu_flush_context_global(domain->iommu, 0);
1336 iommu_flush_iotlb_global(domain->iommu, 0);
1337}
1338
1339static void domain_remove_dev_info(struct dmar_domain *domain)
1340{
1341 struct device_domain_info *info;
1342 unsigned long flags;
1343
1344 spin_lock_irqsave(&device_domain_lock, flags);
1345 while (!list_empty(&domain->devices)) {
1346 info = list_entry(domain->devices.next,
1347 struct device_domain_info, link);
1348 list_del(&info->link);
1349 list_del(&info->global);
1350 if (info->dev)
1351 info->dev->dev.archdata.iommu = NULL;
1352 spin_unlock_irqrestore(&device_domain_lock, flags);
1353
1354 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355 free_devinfo_mem(info);
1356
1357 spin_lock_irqsave(&device_domain_lock, flags);
1358 }
1359 spin_unlock_irqrestore(&device_domain_lock, flags);
1360}
1361
1362/*
1363 * find_domain
1364 * Note: the domain info is stored in struct pci_dev->dev.archdata.iommu
1365 */
1366struct dmar_domain *
1367find_domain(struct pci_dev *pdev)
1368{
1369 struct device_domain_info *info;
1370
1371 /* No lock here, assumes no domain exit in normal case */
1372 info = pdev->dev.archdata.iommu;
1373 if (info)
1374 return info->domain;
1375 return NULL;
1376}
1377
1378static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1379 struct pci_dev *dev)
1380{
1381 int index;
1382
1383 while (dev) {
1384		for (index = 0; index < cnt; index++)
1385 if (dev == devices[index])
1386 return 1;
1387
1388 /* Check our parent */
1389 dev = dev->bus->self;
1390 }
1391
1392 return 0;
1393}
1394
1395static struct dmar_drhd_unit *
1396dmar_find_matched_drhd_unit(struct pci_dev *dev)
1397{
1398 struct dmar_drhd_unit *drhd = NULL;
1399
1400 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1401 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1402 drhd->devices_cnt, dev))
1403 return drhd;
1404 }
1405
1406 return NULL;
1407}
1408
1409/* domain is initialized */
1410static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1411{
1412 struct dmar_domain *domain, *found = NULL;
1413 struct intel_iommu *iommu;
1414 struct dmar_drhd_unit *drhd;
1415 struct device_domain_info *info, *tmp;
1416 struct pci_dev *dev_tmp;
1417 unsigned long flags;
1418 int bus = 0, devfn = 0;
1419
1420 domain = find_domain(pdev);
1421 if (domain)
1422 return domain;
1423
1424 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1425 if (dev_tmp) {
1426 if (dev_tmp->is_pcie) {
1427 bus = dev_tmp->subordinate->number;
1428 devfn = 0;
1429 } else {
1430 bus = dev_tmp->bus->number;
1431 devfn = dev_tmp->devfn;
1432 }
1433 spin_lock_irqsave(&device_domain_lock, flags);
1434 list_for_each_entry(info, &device_domain_list, global) {
1435 if (info->bus == bus && info->devfn == devfn) {
1436 found = info->domain;
1437 break;
1438 }
1439 }
1440 spin_unlock_irqrestore(&device_domain_lock, flags);
1441		/* the pcie-pci bridge already has a domain, use it */
1442 if (found) {
1443 domain = found;
1444 goto found_domain;
1445 }
1446 }
1447
1448 /* Allocate new domain for the device */
1449 drhd = dmar_find_matched_drhd_unit(pdev);
1450 if (!drhd) {
1451 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1452 pci_name(pdev));
1453 return NULL;
1454 }
1455 iommu = drhd->iommu;
1456
1457 domain = iommu_alloc_domain(iommu);
1458 if (!domain)
1459 goto error;
1460
1461 if (domain_init(domain, gaw)) {
1462 domain_exit(domain);
1463 goto error;
1464 }
1465
1466 /* register pcie-to-pci device */
1467 if (dev_tmp) {
1468 info = alloc_devinfo_mem();
1469 if (!info) {
1470 domain_exit(domain);
1471 goto error;
1472 }
1473 info->bus = bus;
1474 info->devfn = devfn;
1475 info->dev = NULL;
1476 info->domain = domain;
1477 /* This domain is shared by devices under p2p bridge */
1478 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1479
1480		/* the pcie-to-pci bridge already has a domain, use it */
1481 found = NULL;
1482 spin_lock_irqsave(&device_domain_lock, flags);
1483 list_for_each_entry(tmp, &device_domain_list, global) {
1484 if (tmp->bus == bus && tmp->devfn == devfn) {
1485 found = tmp->domain;
1486 break;
1487 }
1488 }
1489 if (found) {
1490 free_devinfo_mem(info);
1491 domain_exit(domain);
1492 domain = found;
1493 } else {
1494 list_add(&info->link, &domain->devices);
1495 list_add(&info->global, &device_domain_list);
1496 }
1497 spin_unlock_irqrestore(&device_domain_lock, flags);
1498 }
1499
1500found_domain:
1501 info = alloc_devinfo_mem();
1502 if (!info)
1503 goto error;
1504 info->bus = pdev->bus->number;
1505 info->devfn = pdev->devfn;
1506 info->dev = pdev;
1507 info->domain = domain;
1508 spin_lock_irqsave(&device_domain_lock, flags);
1509	/* somebody else may have raced and set it first */
1510 found = find_domain(pdev);
1511 if (found != NULL) {
1512 spin_unlock_irqrestore(&device_domain_lock, flags);
1513 if (found != domain) {
1514 domain_exit(domain);
1515 domain = found;
1516 }
1517 free_devinfo_mem(info);
1518 return domain;
1519 }
1520 list_add(&info->link, &domain->devices);
1521 list_add(&info->global, &device_domain_list);
1522 pdev->dev.archdata.iommu = info;
1523 spin_unlock_irqrestore(&device_domain_lock, flags);
1524 return domain;
1525error:
1526	/* recheck it here; another thread may have set it */
1527 return find_domain(pdev);
1528}
1529
1530static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1531{
1532 struct dmar_domain *domain;
1533 unsigned long size;
1534 u64 base;
1535 int ret;
1536
1537 printk(KERN_INFO
1538 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1539 pci_name(pdev), start, end);
1540 /* page table init */
1541 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1542 if (!domain)
1543 return -ENOMEM;
1544
1545 /* The address might not be aligned */
1546 base = start & PAGE_MASK_4K;
1547 size = end - base;
1548 size = PAGE_ALIGN_4K(size);
1549 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1550 IOVA_PFN(base + size) - 1)) {
1551 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1552 ret = -ENOMEM;
1553 goto error;
1554 }
1555
1556 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1557 size, base, pci_name(pdev));
1558 /*
1559	 * the RMRR range might overlap the physical memory map;
1560 * clear it first
1561 */
1562 dma_pte_clear_range(domain, base, base + size);
1563
1564 ret = domain_page_mapping(domain, base, base, size,
1565 DMA_PTE_READ|DMA_PTE_WRITE);
1566 if (ret)
1567 goto error;
1568
1569 /* context entry init */
1570 ret = domain_context_mapping(domain, pdev);
1571 if (!ret)
1572 return 0;
1573error:
1574 domain_exit(domain);
1575 return ret;
1576
1577}
1578
1579static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1580 struct pci_dev *pdev)
1581{
1582 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1583 return 0;
1584 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1585 rmrr->end_address + 1);
1586}
1587
1588#ifdef CONFIG_DMAR_GFX_WA
1589extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
1590static void __init iommu_prepare_gfx_mapping(void)
1591{
1592 struct pci_dev *pdev = NULL;
1593 u64 base, size;
1594 int slot;
1595 int ret;
1596
1597 for_each_pci_dev(pdev) {
1598 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599 !IS_GFX_DEVICE(pdev))
1600 continue;
1601 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1602 pci_name(pdev));
1603 slot = arch_get_ram_range(0, &base, &size);
1604 while (slot >= 0) {
1605 ret = iommu_prepare_identity_map(pdev,
1606 base, base + size);
1607 if (ret)
1608 goto error;
1609 slot = arch_get_ram_range(slot, &base, &size);
1610 }
1611 continue;
1612error:
1613 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1614 }
1615}
1616#endif
1617
1618#ifdef CONFIG_DMAR_FLOPPY_WA
1619static inline void iommu_prepare_isa(void)
1620{
1621 struct pci_dev *pdev;
1622 int ret;
1623
1624 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1625 if (!pdev)
1626 return;
1627
1628 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1629 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1630
1631 if (ret)
1632 printk("IOMMU: Failed to create 0-64M identity map, "
1633 "floppy might not work\n");
1634
1635}
1636#else
1637static inline void iommu_prepare_isa(void)
1638{
1639 return;
1640}
1641#endif /* !CONFIG_DMAR_FLOPPY_WA */
1642
1643int __init init_dmars(void)
1644{
1645 struct dmar_drhd_unit *drhd;
1646 struct dmar_rmrr_unit *rmrr;
1647 struct pci_dev *pdev;
1648 struct intel_iommu *iommu;
1649 int ret, unit = 0;
1650
1651 /*
1652 * for each drhd
1653 * allocate root
1654 * initialize and program root entry to not present
1655 * endfor
1656 */
1657 for_each_drhd_unit(drhd) {
1658 if (drhd->ignored)
1659 continue;
1660 iommu = alloc_iommu(drhd);
1661 if (!iommu) {
1662 ret = -ENOMEM;
1663 goto error;
1664 }
1665
1666 /*
1667 * TBD:
1668		 * we could share the same root & context tables
1669		 * among all IOMMUs; need to split this out later.
1670 */
1671 ret = iommu_alloc_root_entry(iommu);
1672 if (ret) {
1673 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1674 goto error;
1675 }
1676 }
1677
1678 /*
1679 * For each rmrr
1680 * for each dev attached to rmrr
1681 * do
1682 * locate drhd for dev, alloc domain for dev
1683 * allocate free domain
1684 * allocate page table entries for rmrr
1685 * if context not allocated for bus
1686 * allocate and init context
1687 * set present in root table for this bus
1688 * init context with domain, translation etc
1689 * endfor
1690 * endfor
1691 */
1692 for_each_rmrr_units(rmrr) {
1693 int i;
1694 for (i = 0; i < rmrr->devices_cnt; i++) {
1695 pdev = rmrr->devices[i];
1696			/* some BIOSes list non-existent devices in the DMAR table */
1697 if (!pdev)
1698 continue;
1699 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1700 if (ret)
1701 printk(KERN_ERR
1702 "IOMMU: mapping reserved region failed\n");
1703 }
1704 }
1705
1706 iommu_prepare_gfx_mapping();
1707
1708 iommu_prepare_isa();
1709
1710 /*
1711 * for each drhd
1712 * enable fault log
1713 * global invalidate context cache
1714 * global invalidate iotlb
1715 * enable translation
1716 */
1717 for_each_drhd_unit(drhd) {
1718 if (drhd->ignored)
1719 continue;
1720 iommu = drhd->iommu;
1721		sprintf(iommu->name, "dmar%d", unit++);
1722
1723 iommu_flush_write_buffer(iommu);
1724
1725 ret = dmar_set_interrupt(iommu);
1726 if (ret)
1727 goto error;
1728
1729 iommu_set_root_entry(iommu);
1730
1731 iommu_flush_context_global(iommu, 0);
1732 iommu_flush_iotlb_global(iommu, 0);
1733
1734 ret = iommu_enable_translation(iommu);
1735 if (ret)
1736 goto error;
1737 }
1738
1739 return 0;
1740error:
1741 for_each_drhd_unit(drhd) {
1742 if (drhd->ignored)
1743 continue;
1744 iommu = drhd->iommu;
1745 free_iommu(iommu);
1746 }
1747 return ret;
1748}
1749
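/*
 * aligned_size() returns the host range rounded up to whole 4K pages:
 * e.g. 0x100 bytes at host address 0x1003 round up to one page, while
 * 2 bytes starting at page offset 0xfff straddle two.
 */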
1750static inline u64 aligned_size(u64 host_addr, size_t size)
1751{
1752 u64 addr;
1753 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1754 return PAGE_ALIGN_4K(addr);
1755}
1756
1757struct iova *
1758iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1759{
1760 struct iova *piova;
1761
1762 /* Make sure it's in range */
1763 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1764 if (!size || (IOVA_START_ADDR + size > end))
1765 return NULL;
1766
1767 piova = alloc_iova(&domain->iovad,
1768 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1769 return piova;
1770}
1771
1772static struct iova *
1773__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1774 size_t size)
1775{
1776 struct pci_dev *pdev = to_pci_dev(dev);
1777 struct iova *iova = NULL;
1778
1779 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1780 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1781 } else {
1782 /*
1783 * First try to allocate an io virtual address in
1784 * DMA_32BIT_MASK and if that fails then try allocating
1785		 * from the higher range
1786 */
1787 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1788 if (!iova)
1789 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1790 }
1791
1792 if (!iova) {
1793		printk(KERN_ERR"Allocating iova for %s failed\n", pci_name(pdev));
1794 return NULL;
1795 }
1796
1797 return iova;
1798}
1799
1800static struct dmar_domain *
1801get_valid_domain_for_dev(struct pci_dev *pdev)
1802{
1803 struct dmar_domain *domain;
1804 int ret;
1805
1806 domain = get_domain_for_dev(pdev,
1807 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1808 if (!domain) {
1809 printk(KERN_ERR
1810 "Allocating domain for %s failed", pci_name(pdev));
1811 return 0;
1812 }
1813
1814 /* make sure context mapping is ok */
1815 if (unlikely(!domain_context_mapped(domain, pdev))) {
1816 ret = domain_context_mapping(domain, pdev);
1817 if (ret) {
1818 printk(KERN_ERR
1819 "Domain context map for %s failed",
1820 pci_name(pdev));
1821 return 0;
1822 }
1823 }
1824
1825 return domain;
1826}
1827
1828static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1829 size_t size, int dir)
1830{
1831 struct pci_dev *pdev = to_pci_dev(hwdev);
1832 int ret;
1833 struct dmar_domain *domain;
1834 unsigned long start_addr;
1835 struct iova *iova;
1836 int prot = 0;
1837
1838 BUG_ON(dir == DMA_NONE);
1839 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1840 return virt_to_bus(addr);
1841
1842 domain = get_valid_domain_for_dev(pdev);
1843 if (!domain)
1844 return 0;
1845
1846 addr = (void *)virt_to_phys(addr);
1847 size = aligned_size((u64)addr, size);
1848
1849 iova = __intel_alloc_iova(hwdev, domain, size);
1850 if (!iova)
1851 goto error;
1852
1853 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1854
1855 /*
1856	 * Check if DMAR supports zero-length reads on write-only
1857	 * mappings.
1858	 */
1859	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1860 !cap_zlr(domain->iommu->cap))
1861 prot |= DMA_PTE_READ;
1862 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1863 prot |= DMA_PTE_WRITE;
1864 /*
1865	 * addr .. addr + size might cover only part of a page; map the
1866	 * whole page. Note: if two parts of one page are mapped
1867	 * separately, two guest addresses may map to the same host
1868	 * address, but this is not a big problem
1869 */
1870 ret = domain_page_mapping(domain, start_addr,
1871 ((u64)addr) & PAGE_MASK_4K, size, prot);
1872 if (ret)
1873 goto error;
1874
1875 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1876 pci_name(pdev), size, (u64)addr,
1877 size, (u64)start_addr, dir);
1878
1879 /* it's a non-present to present mapping */
1880 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1881 start_addr, size >> PAGE_SHIFT_4K, 1);
1882 if (ret)
1883 iommu_flush_write_buffer(domain->iommu);
1884
1885 return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1886
1887error:
1888 if (iova)
1889 __free_iova(&domain->iovad, iova);
1890 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1891 pci_name(pdev), size, (u64)addr, dir);
1892 return 0;
1893}
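
For illustration, the prot computation above reduces to the following standalone sketch (local copies of the constants; cap_zlr is modelled as a plain flag):

#include <stdio.h>

#define DMA_PTE_READ  1
#define DMA_PTE_WRITE 2

enum dma_dir { DMA_BIDIRECTIONAL, DMA_TO_DEVICE, DMA_FROM_DEVICE };

/* compute_prot mirrors the prot logic in intel_map_single(): the READ
 * bit can only be dropped for device-to-memory maps when the hardware
 * supports zero-length reads (ZLR). */
static int compute_prot(enum dma_dir dir, int zlr)
{
	int prot = 0;

	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || !zlr)
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	return prot;
}

int main(void)
{
	printf("FROM_DEVICE, zlr=1 -> prot=%d (write-only)\n",
	       compute_prot(DMA_FROM_DEVICE, 1));
	printf("FROM_DEVICE, zlr=0 -> prot=%d (read+write)\n",
	       compute_prot(DMA_FROM_DEVICE, 0));
	return 0;
}
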
1894
1895static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1896 size_t size, int dir)
1897{
1898 struct pci_dev *pdev = to_pci_dev(dev);
1899 struct dmar_domain *domain;
1900 unsigned long start_addr;
1901 struct iova *iova;
1902
1903 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1904 return;
1905 domain = find_domain(pdev);
1906 BUG_ON(!domain);
1907
1908 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1909 if (!iova)
1910 return;
1911
1912 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1913 size = aligned_size((u64)dev_addr, size);
1914
1915 pr_debug("Device %s unmapping: %lx@%llx\n",
1916 pci_name(pdev), size, (u64)start_addr);
1917
1918 /* clear the whole page */
1919 dma_pte_clear_range(domain, start_addr, start_addr + size);
1920 /* free page tables */
1921 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1922
1923 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1924 size >> PAGE_SHIFT_4K, 0))
1925 iommu_flush_write_buffer(domain->iommu);
1926
1927 /* free iova */
1928 __free_iova(&domain->iovad, iova);
1929}
1930
1931static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1932 dma_addr_t *dma_handle, gfp_t flags)
1933{
1934 void *vaddr;
1935 int order;
1936
1937 size = PAGE_ALIGN_4K(size);
1938 order = get_order(size);
1939 flags &= ~(GFP_DMA | GFP_DMA32);
1940
1941 vaddr = (void *)__get_free_pages(flags, order);
1942 if (!vaddr)
1943 return NULL;
1944 memset(vaddr, 0, size);
1945
1946 *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1947 if (*dma_handle)
1948 return vaddr;
1949 free_pages((unsigned long)vaddr, order);
1950 return NULL;
1951}
1952
1953static void intel_free_coherent(struct device *hwdev, size_t size,
1954 void *vaddr, dma_addr_t dma_handle)
1955{
1956 int order;
1957
1958 size = PAGE_ALIGN_4K(size);
1959 order = get_order(size);
1960
1961 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1962 free_pages((unsigned long)vaddr, order);
1963}
1964
1965#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
1966static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1967 int nelems, int dir)
1968{
1969 int i;
1970 struct pci_dev *pdev = to_pci_dev(hwdev);
1971 struct dmar_domain *domain;
1972 unsigned long start_addr;
1973 struct iova *iova;
1974 size_t size = 0;
1975 void *addr;
1976 struct scatterlist *sg;
1977
1978 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1979 return;
1980
1981 domain = find_domain(pdev);
1982
1983 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
1984 if (!iova)
1985 return;
1986 for_each_sg(sglist, sg, nelems, i) {
1987 addr = SG_ENT_VIRT_ADDRESS(sg);
1988 size += aligned_size((u64)addr, sg->length);
1989 }
1990
1991 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1992
1993 /* clear the whole page */
1994 dma_pte_clear_range(domain, start_addr, start_addr + size);
1995 /* free page tables */
1996 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1997
1998 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1999 size >> PAGE_SHIFT_4K, 0))
2000 iommu_flush_write_buffer(domain->iommu);
2001
2002 /* free iova */
2003 __free_iova(&domain->iovad, iova);
2004}
2005
2006static int intel_nontranslate_map_sg(struct device *hwdev,
2007 struct scatterlist *sglist, int nelems, int dir)
2008{
2009 int i;
2010 struct scatterlist *sg;
2011
2012 for_each_sg(sglist, sg, nelems, i) {
2013 BUG_ON(!sg->page);
2014 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2015 sg->dma_length = sg->length;
2016 }
2017 return nelems;
2018}
2019
2020static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2021 int nelems, int dir)
2022{
2023 void *addr;
2024 int i;
2025 struct pci_dev *pdev = to_pci_dev(hwdev);
2026 struct dmar_domain *domain;
2027 size_t size = 0;
2028 int prot = 0;
2029 size_t offset = 0;
2030 struct iova *iova = NULL;
2031 int ret;
2032 struct scatterlist *sg;
2033 unsigned long start_addr;
2034
2035 BUG_ON(dir == DMA_NONE);
2036 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2037 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2038
2039 domain = get_valid_domain_for_dev(pdev);
2040 if (!domain)
2041 return 0;
2042
2043 for_each_sg(sglist, sg, nelems, i) {
2044 addr = SG_ENT_VIRT_ADDRESS(sg);
2045 addr = (void *)virt_to_phys(addr);
2046 size += aligned_size((u64)addr, sg->length);
2047 }
2048
2049 iova = __intel_alloc_iova(hwdev, domain, size);
2050 if (!iova) {
2051 sglist->dma_length = 0;
2052 return 0;
2053 }
2054
2055 /*
2056	 * Check if DMAR supports zero-length reads on write-only
2057	 * mappings.
2058 */
2059	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2060 !cap_zlr(domain->iommu->cap))
2061 prot |= DMA_PTE_READ;
2062 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2063 prot |= DMA_PTE_WRITE;
2064
2065 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2066 offset = 0;
2067 for_each_sg(sglist, sg, nelems, i) {
2068 addr = SG_ENT_VIRT_ADDRESS(sg);
2069 addr = (void *)virt_to_phys(addr);
2070 size = aligned_size((u64)addr, sg->length);
2071 ret = domain_page_mapping(domain, start_addr + offset,
2072 ((u64)addr) & PAGE_MASK_4K,
2073 size, prot);
2074 if (ret) {
2075 /* clear the page */
2076 dma_pte_clear_range(domain, start_addr,
2077 start_addr + offset);
2078 /* free page tables */
2079 dma_pte_free_pagetable(domain, start_addr,
2080 start_addr + offset);
2081 /* free iova */
2082 __free_iova(&domain->iovad, iova);
2083 return 0;
2084 }
2085 sg->dma_address = start_addr + offset +
2086 ((u64)addr & (~PAGE_MASK_4K));
2087 sg->dma_length = sg->length;
2088 offset += size;
2089 }
2090
2091 /* it's a non-present to present mapping */
2092 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2093 start_addr, offset >> PAGE_SHIFT_4K, 1))
2094 iommu_flush_write_buffer(domain->iommu);
2095 return nelems;
2096}
2097
2098static struct dma_mapping_ops intel_dma_ops = {
2099 .alloc_coherent = intel_alloc_coherent,
2100 .free_coherent = intel_free_coherent,
2101 .map_single = intel_map_single,
2102 .unmap_single = intel_unmap_single,
2103 .map_sg = intel_map_sg,
2104 .unmap_sg = intel_unmap_sg,
2105};
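
A minimal standalone model of what registering this table does: the generic DMA API on x86-64 forwards through a single ops pointer, so pointing it at intel_dma_ops (see intel_iommu_init() below) reroutes every mapping call. All names in this sketch are local stand-ins, not kernel symbols:

#include <stddef.h>
#include <stdio.h>

struct mini_dma_ops {
	unsigned long (*map_single)(void *dev, void *addr, size_t size, int dir);
};

static unsigned long mini_intel_map_single(void *dev, void *addr,
					   size_t size, int dir)
{
	(void)dev; (void)addr; (void)dir;
	printf("IOMMU path: mapping %zu bytes\n", size);
	return 0x1000;			/* pretend IOVA */
}

static struct mini_dma_ops mini_intel_ops = {
	.map_single = mini_intel_map_single,
};
static struct mini_dma_ops *mini_dma_ops = &mini_intel_ops;

int main(void)
{
	char buf[64];
	unsigned long handle = mini_dma_ops->map_single(NULL, buf, sizeof(buf), 1);

	printf("bus address %#lx\n", handle);
	return 0;
}
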
2106
2107static inline int iommu_domain_cache_init(void)
2108{
2109 int ret = 0;
2110
2111 iommu_domain_cache = kmem_cache_create("iommu_domain",
2112 sizeof(struct dmar_domain),
2113 0,
2114 SLAB_HWCACHE_ALIGN,
2115
2116 NULL);
2117 if (!iommu_domain_cache) {
2118 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2119 ret = -ENOMEM;
2120 }
2121
2122 return ret;
2123}
2124
2125static inline int iommu_devinfo_cache_init(void)
2126{
2127 int ret = 0;
2128
2129 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2130 sizeof(struct device_domain_info),
2131 0,
2132 SLAB_HWCACHE_ALIGN,
2133
2134 NULL);
2135 if (!iommu_devinfo_cache) {
2136 printk(KERN_ERR "Couldn't create devinfo cache\n");
2137 ret = -ENOMEM;
2138 }
2139
2140 return ret;
2141}
2142
2143static inline int iommu_iova_cache_init(void)
2144{
2145 int ret = 0;
2146
2147 iommu_iova_cache = kmem_cache_create("iommu_iova",
2148 sizeof(struct iova),
2149 0,
2150 SLAB_HWCACHE_ALIGN,
2151
2152 NULL);
2153 if (!iommu_iova_cache) {
2154 printk(KERN_ERR "Couldn't create iova cache\n");
2155 ret = -ENOMEM;
2156 }
2157
2158 return ret;
2159}
2160
2161static int __init iommu_init_mempool(void)
2162{
2163 int ret;
2164 ret = iommu_iova_cache_init();
2165 if (ret)
2166 return ret;
2167
2168 ret = iommu_domain_cache_init();
2169 if (ret)
2170 goto domain_error;
2171
2172 ret = iommu_devinfo_cache_init();
2173 if (!ret)
2174 return ret;
2175
2176 kmem_cache_destroy(iommu_domain_cache);
2177domain_error:
2178 kmem_cache_destroy(iommu_iova_cache);
2179
2180 return -ENOMEM;
2181}
2182
2183static void __init iommu_exit_mempool(void)
2184{
2185 kmem_cache_destroy(iommu_devinfo_cache);
2186 kmem_cache_destroy(iommu_domain_cache);
2187 kmem_cache_destroy(iommu_iova_cache);
2188
2189}
2190
2191void __init detect_intel_iommu(void)
2192{
2193 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2194 return;
2195 if (early_dmar_detect()) {
2196 iommu_detected = 1;
2197 }
2198}
2199
2200static void __init init_no_remapping_devices(void)
2201{
2202 struct dmar_drhd_unit *drhd;
2203
2204 for_each_drhd_unit(drhd) {
2205 if (!drhd->include_all) {
2206 int i;
2207 for (i = 0; i < drhd->devices_cnt; i++)
2208 if (drhd->devices[i] != NULL)
2209 break;
2210 /* ignore DMAR unit if no pci devices exist */
2211 if (i == drhd->devices_cnt)
2212 drhd->ignored = 1;
2213 }
2214 }
2215
2216 if (dmar_map_gfx)
2217 return;
2218
2219 for_each_drhd_unit(drhd) {
2220 int i;
2221 if (drhd->ignored || drhd->include_all)
2222 continue;
2223
2224 for (i = 0; i < drhd->devices_cnt; i++)
2225 if (drhd->devices[i] &&
2226 !IS_GFX_DEVICE(drhd->devices[i]))
2227 break;
2228
2229 if (i < drhd->devices_cnt)
2230 continue;
2231
2232 /* bypass IOMMU if it is just for gfx devices */
2233 drhd->ignored = 1;
2234 for (i = 0; i < drhd->devices_cnt; i++) {
2235 if (!drhd->devices[i])
2236 continue;
2237 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2238 }
2239 }
2240}
2241
2242int __init intel_iommu_init(void)
2243{
2244 int ret = 0;
2245
2246 if (no_iommu || swiotlb || dmar_disabled)
2247 return -ENODEV;
2248
2249 if (dmar_table_init())
2250 return -ENODEV;
2251
2252 iommu_init_mempool();
2253 dmar_init_reserved_ranges();
2254
2255 init_no_remapping_devices();
2256
2257 ret = init_dmars();
2258 if (ret) {
2259 printk(KERN_ERR "IOMMU: dmar init failed\n");
2260 put_iova_domain(&reserved_iova_list);
2261 iommu_exit_mempool();
2262 return ret;
2263 }
2264 printk(KERN_INFO
2265 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2266
2267 force_iommu = 1;
2268 dma_ops = &intel_dma_ops;
2269 return 0;
2270}
2271
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
new file mode 100644
index 00000000000..ee88dd2400c
--- /dev/null
+++ b/drivers/pci/intel-iommu.h
@@ -0,0 +1,325 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
19 */
20
21#ifndef _INTEL_IOMMU_H_
22#define _INTEL_IOMMU_H_
23
24#include <linux/types.h>
25#include <linux/msi.h>
26#include "iova.h"
27#include <linux/io.h>
28
29/*
30 * Intel IOMMU register specification per version 1.0 public spec.
31 */
32
33#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */
34#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */
35#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */
36#define DMAR_GCMD_REG 0x18 /* Global command register */
37#define DMAR_GSTS_REG 0x1c /* Global status register */
38#define DMAR_RTADDR_REG 0x20 /* Root entry table */
39#define DMAR_CCMD_REG 0x28 /* Context command reg */
40#define DMAR_FSTS_REG 0x34 /* Fault Status register */
41#define DMAR_FECTL_REG 0x38 /* Fault control register */
42#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */
43#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */
44#define DMAR_FEUADDR_REG 0x44 /* Upper address register */
45#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */
46#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */
47#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */
48#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */
49#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */
50#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */
51
52#define OFFSET_STRIDE (9)
53/*
54#define dmar_readl(dmar, reg) readl(dmar + reg)
55#define dmar_readq(dmar, reg) ({ \
56 u32 lo, hi; \
57 lo = readl(dmar + reg); \
58 hi = readl(dmar + reg + 4); \
59 (((u64) hi) << 32) + lo; })
60*/
61static inline u64 dmar_readq(void *addr)
62{
63 u32 lo, hi;
64 lo = readl(addr);
65 hi = readl(addr + 4);
66 return (((u64) hi) << 32) + lo;
67}
68
69static inline void dmar_writeq(void __iomem *addr, u64 val)
70{
71 writel((u32)val, addr);
72 writel((u32)(val >> 32), addr + 4);
73}
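
The same split-access pattern, rerun standalone with an in-memory array standing in for the MMIO window (sample value only):

#include <stdint.h>
#include <stdio.h>

/* A 64-bit register accessed as two 32-bit halves, low word at
 * offset 0 and high word at offset 4, as dmar_readq/dmar_writeq do. */
static void writeq_split(volatile uint32_t *reg, uint64_t val)
{
	reg[0] = (uint32_t)val;
	reg[1] = (uint32_t)(val >> 32);
}

static uint64_t readq_split(volatile uint32_t *reg)
{
	uint32_t lo = reg[0];
	uint32_t hi = reg[1];

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint32_t fake_reg[2] = { 0, 0 };

	writeq_split(fake_reg, 0x1122334455667788ULL);
	printf("read back: %#llx\n", (unsigned long long)readq_split(fake_reg));
	return 0;
}
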
74
75#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4)
76#define DMAR_VER_MINOR(v) ((v) & 0x0f)
77
78/*
79 * Decoding Capability Register
80 */
81#define cap_read_drain(c) (((c) >> 55) & 1)
82#define cap_write_drain(c) (((c) >> 54) & 1)
83#define cap_max_amask_val(c) (((c) >> 48) & 0x3f)
84#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1)
85#define cap_pgsel_inv(c) (((c) >> 39) & 1)
86
87#define cap_super_page_val(c) (((c) >> 34) & 0xf)
88#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \
89 * OFFSET_STRIDE) + 21)
90
91#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
92#define cap_max_fault_reg_offset(c) \
93 (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
94
95#define cap_zlr(c) (((c) >> 22) & 1)
96#define cap_isoch(c) (((c) >> 23) & 1)
97#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1)
98#define cap_sagaw(c) (((c) >> 8) & 0x1f)
99#define cap_caching_mode(c) (((c) >> 7) & 1)
100#define cap_phmr(c) (((c) >> 6) & 1)
101#define cap_plmr(c) (((c) >> 5) & 1)
102#define cap_rwbf(c) (((c) >> 4) & 1)
103#define cap_afl(c) (((c) >> 3) & 1)
104#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
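
A standalone decode of a made-up capability value, with a few of the extractors copied locally so the snippet compiles on its own:

#include <stdio.h>

#define cap_zlr(c)          (((c) >> 22) & 1)
#define cap_mgaw(c)         ((((c) >> 16) & 0x3f) + 1)
#define cap_sagaw(c)        (((c) >> 8) & 0x1f)
#define cap_caching_mode(c) (((c) >> 7) & 1)
#define cap_ndoms(c)        (((unsigned long)1) << (4 + 2 * ((c) & 0x7)))

int main(void)
{
	unsigned long long c = 0x0000000000660262ULL;	/* arbitrary sample */

	printf("mgaw=%llu sagaw=%#llx zlr=%llu cm=%llu ndoms=%lu\n",
	       (unsigned long long)cap_mgaw(c),
	       (unsigned long long)cap_sagaw(c),
	       (unsigned long long)cap_zlr(c),
	       (unsigned long long)cap_caching_mode(c),
	       cap_ndoms(c));
	return 0;
}
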
105/*
106 * Extended Capability Register
107 */
108
109#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1)
110#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
111#define ecap_max_iotlb_offset(e) \
112 (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
113#define ecap_coherent(e) ((e) & 0x1)
114
115
116/* IOTLB_REG */
117#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
118#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
119#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
120#define DMA_TLB_IIRG(type) ((type >> 60) & 7)
121#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
122#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
123#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
124#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32)
125#define DMA_TLB_IVT (((u64)1) << 63)
126#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
127#define DMA_TLB_MAX_SIZE (0x3f)
128
129/* GCMD_REG */
130#define DMA_GCMD_TE (((u32)1) << 31)
131#define DMA_GCMD_SRTP (((u32)1) << 30)
132#define DMA_GCMD_SFL (((u32)1) << 29)
133#define DMA_GCMD_EAFL (((u32)1) << 28)
134#define DMA_GCMD_WBF (((u32)1) << 27)
135
136/* GSTS_REG */
137#define DMA_GSTS_TES (((u32)1) << 31)
138#define DMA_GSTS_RTPS (((u32)1) << 30)
139#define DMA_GSTS_FLS (((u32)1) << 29)
140#define DMA_GSTS_AFLS (((u32)1) << 28)
141#define DMA_GSTS_WBFS (((u32)1) << 27)
142
143/* CCMD_REG */
144#define DMA_CCMD_ICC (((u64)1) << 63)
145#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
146#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
147#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
148#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
149#define DMA_CCMD_MASK_NOBIT 0
150#define DMA_CCMD_MASK_1BIT 1
151#define DMA_CCMD_MASK_2BIT 2
152#define DMA_CCMD_MASK_3BIT 3
153#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
154#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
155
156/* FECTL_REG */
157#define DMA_FECTL_IM (((u32)1) << 31)
158
159/* FSTS_REG */
160#define DMA_FSTS_PPF ((u32)2)
161#define DMA_FSTS_PFO ((u32)1)
162#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
163
164/* FRCD_REG, 32 bits access */
165#define DMA_FRCD_F (((u32)1) << 31)
166#define dma_frcd_type(d) ((d >> 30) & 1)
167#define dma_frcd_fault_reason(c) (c & 0xff)
168#define dma_frcd_source_id(c) (c & 0xffff)
169#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
170
171/*
172 * 0: Present
173 * 1-11: Reserved
174 * 12-63: Context Ptr (12 - (haw-1))
175 * 64-127: Reserved
176 */
177struct root_entry {
178 u64 val;
179 u64 rsvd1;
180};
181#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
182static inline bool root_present(struct root_entry *root)
183{
184 return (root->val & 1);
185}
186static inline void set_root_present(struct root_entry *root)
187{
188 root->val |= 1;
189}
190static inline void set_root_value(struct root_entry *root, unsigned long value)
191{
192 root->val |= value & PAGE_MASK_4K;
193}
194
195struct context_entry;
196static inline struct context_entry *
197get_context_addr_from_root(struct root_entry *root)
198{
199 return (struct context_entry *)
200 (root_present(root)?phys_to_virt(
201 root->val & PAGE_MASK_4K):
202 NULL);
203}
204
205/*
206 * low 64 bits:
207 * 0: present
208 * 1: fault processing disable
209 * 2-3: translation type
210 * 12-63: address space root
211 * high 64 bits:
212 * 0-2: address width
213 * 3-6: avail
214 * 8-23: domain id
215 */
216struct context_entry {
217 u64 lo;
218 u64 hi;
219};
220#define context_present(c) ((c).lo & 1)
221#define context_fault_disable(c) (((c).lo >> 1) & 1)
222#define context_translation_type(c) (((c).lo >> 2) & 3)
223#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
224#define context_address_width(c) ((c).hi & 7)
225#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
226
227#define context_set_present(c) do {(c).lo |= 1;} while (0)
228#define context_set_fault_enable(c) \
229 do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
230#define context_set_translation_type(c, val) \
231 do { \
232 (c).lo &= (((u64)-1) << 4) | 3; \
233 (c).lo |= ((val) & 3) << 2; \
234 } while (0)
235#define CONTEXT_TT_MULTI_LEVEL 0
236#define context_set_address_root(c, val) \
237 do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
238#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
239#define context_set_domain_id(c, val) \
240 do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
241#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
242
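
A standalone walk-through of programming one context entry with local copies of these macros; the domain id, width, and root address are sample values:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
#define PAGE_MASK_4K (((u64)-1) << 12)

struct context_entry { u64 lo; u64 hi; };

#define context_set_present(c)            do {(c).lo |= 1;} while (0)
#define context_set_address_root(c, val)  do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
#define context_set_domain_id(c, val) \
	do {(c).hi |= ((u64)((val) & ((1 << 16) - 1))) << 8;} while (0)

int main(void)
{
	struct context_entry ce = { 0, 0 };

	context_set_domain_id(ce, 42);			/* bits 8-23 of hi */
	context_set_address_width(ce, 2);		/* bits 0-2 of hi */
	context_set_address_root(ce, 0x12345000ULL);	/* 4K-aligned root in lo */
	context_set_present(ce);			/* bit 0 of lo, set last */

	printf("lo=%#llx hi=%#llx\n",
	       (unsigned long long)ce.lo, (unsigned long long)ce.hi);
	return 0;
}
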
243/*
244 * 0: readable
245 * 1: writable
246 * 2-6: reserved
247 * 7: super page
248 * 8-11: available
249 * 12-63: Host physical address
250 */
251struct dma_pte {
252 u64 val;
253};
254#define dma_clear_pte(p) do {(p).val = 0;} while (0)
255
256#define DMA_PTE_READ (1)
257#define DMA_PTE_WRITE (2)
258
259#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
260#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
261#define dma_set_pte_prot(p, prot) \
262 do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
263#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
264#define dma_set_pte_addr(p, addr) do {\
265 (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
266#define dma_pte_present(p) (((p).val & 3) != 0)
267
268struct intel_iommu;
269
270struct dmar_domain {
271 int id; /* domain id */
272 struct intel_iommu *iommu; /* back pointer to owning iommu */
273
274 struct list_head devices; /* all devices' list */
275 struct iova_domain iovad; /* iova's that belong to this domain */
276
277 struct dma_pte *pgd; /* virtual address */
278 spinlock_t mapping_lock; /* page table lock */
279 int gaw; /* max guest address width */
280
281 /* adjusted guest address width, 0 is level 2 30-bit */
282 int agaw;
283
284#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
285 int flags;
286};
287
288/* PCI domain-device relationship */
289struct device_domain_info {
290 struct list_head link; /* link to domain siblings */
291 struct list_head global; /* link to global list */
292	u8 bus; /* PCI bus number */
293 u8 devfn; /* PCI devfn number */
294	struct pci_dev *dev; /* NULL for a PCIE-to-PCI bridge */
295 struct dmar_domain *domain; /* pointer to domain */
296};
297
298extern int init_dmars(void);
299
300struct intel_iommu {
301 void __iomem *reg; /* Pointer to hardware regs, virtual addr */
302 u64 cap;
303 u64 ecap;
304 unsigned long *domain_ids; /* bitmap of domains */
305 struct dmar_domain **domains; /* ptr to domains */
306 int seg;
307 u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
308 spinlock_t lock; /* protect context, domain ids */
309 spinlock_t register_lock; /* protect register handling */
310 struct root_entry *root_entry; /* virtual address */
311
312 unsigned int irq;
313 unsigned char name[7]; /* Device Name */
314 struct msi_msg saved_msg;
315 struct sys_device sysdev;
316};
317
318#ifndef CONFIG_DMAR_GFX_WA
319static inline void iommu_prepare_gfx_mapping(void)
320{
321 return;
322}
323#endif /* !CONFIG_DMAR_GFX_WA */
324
325#endif
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
new file mode 100644
index 00000000000..a84571c2936
--- /dev/null
+++ b/drivers/pci/iova.c
@@ -0,0 +1,394 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This file is released under the GPLv2.
5 *
6 * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
7 */
8
9#include "iova.h"
10
11void
12init_iova_domain(struct iova_domain *iovad)
13{
14 spin_lock_init(&iovad->iova_alloc_lock);
15 spin_lock_init(&iovad->iova_rbtree_lock);
16 iovad->rbroot = RB_ROOT;
17 iovad->cached32_node = NULL;
18
19}
20
21static struct rb_node *
22__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
23{
24 if ((*limit_pfn != DMA_32BIT_PFN) ||
25 (iovad->cached32_node == NULL))
26 return rb_last(&iovad->rbroot);
27 else {
28 struct rb_node *prev_node = rb_prev(iovad->cached32_node);
29 struct iova *curr_iova =
30 container_of(iovad->cached32_node, struct iova, node);
31 *limit_pfn = curr_iova->pfn_lo - 1;
32 return prev_node;
33 }
34}
35
36static void
37__cached_rbnode_insert_update(struct iova_domain *iovad,
38 unsigned long limit_pfn, struct iova *new)
39{
40 if (limit_pfn != DMA_32BIT_PFN)
41 return;
42 iovad->cached32_node = &new->node;
43}
44
45static void
46__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
47{
48 struct iova *cached_iova;
49 struct rb_node *curr;
50
51 if (!iovad->cached32_node)
52 return;
53 curr = iovad->cached32_node;
54 cached_iova = container_of(curr, struct iova, node);
55
56 if (free->pfn_lo >= cached_iova->pfn_lo)
57 iovad->cached32_node = rb_next(&free->node);
58}
59
60/* Computes the padding size required to make the
61 * start address naturally aligned on its size
62 */
63static int
64iova_get_pad_size(int size, unsigned int limit_pfn)
65{
66 unsigned int pad_size = 0;
67 unsigned int order = ilog2(size);
68
69 if (order)
70 pad_size = (limit_pfn + 1) % (1 << order);
71
72 return pad_size;
73}
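
Worked standalone example of this padding arithmetic (arbitrary sample numbers): with size 8 and limit_pfn 0x1005, pad comes out 6 so that pfn_lo lands on an 8-page boundary:

#include <stdio.h>

/* For the top-down allocator, choose pad so that
 * pfn_lo = limit_pfn - (size + pad) + 1 comes out a multiple of the
 * (power-of-two-rounded) size, as iova_get_pad_size() does above. */
static unsigned int pad_for(unsigned int size, unsigned int limit_pfn)
{
	unsigned int order = 0;

	while ((1u << (order + 1)) <= size)
		order++;			/* ilog2(size) */
	return order ? (limit_pfn + 1) % (1u << order) : 0;
}

int main(void)
{
	unsigned int size = 8, limit_pfn = 0x1005;
	unsigned int pad = pad_for(size, limit_pfn);
	unsigned int pfn_lo = limit_pfn - (size + pad) + 1;

	printf("pad=%u pfn_lo=%#x (%% size = %u)\n", pad, pfn_lo, pfn_lo % size);
	return 0;
}
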
74
75static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
76 unsigned long limit_pfn, struct iova *new, bool size_aligned)
77{
78 struct rb_node *curr = NULL;
79 unsigned long flags;
80 unsigned long saved_pfn;
81 unsigned int pad_size = 0;
82
83 /* Walk the tree backwards */
84 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
85 saved_pfn = limit_pfn;
86 curr = __get_cached_rbnode(iovad, &limit_pfn);
87 while (curr) {
88 struct iova *curr_iova = container_of(curr, struct iova, node);
89 if (limit_pfn < curr_iova->pfn_lo)
90 goto move_left;
91 else if (limit_pfn < curr_iova->pfn_hi)
92 goto adjust_limit_pfn;
93 else {
94 if (size_aligned)
95 pad_size = iova_get_pad_size(size, limit_pfn);
96 if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
97 break; /* found a free slot */
98 }
99adjust_limit_pfn:
100 limit_pfn = curr_iova->pfn_lo - 1;
101move_left:
102 curr = rb_prev(curr);
103 }
104
105 if (!curr) {
106 if (size_aligned)
107 pad_size = iova_get_pad_size(size, limit_pfn);
108 if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
109 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
110 return -ENOMEM;
111 }
112 }
113
114 /* pfn_lo will point to size aligned address if size_aligned is set */
115 new->pfn_lo = limit_pfn - (size + pad_size) + 1;
116 new->pfn_hi = new->pfn_lo + size - 1;
117
118 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
119 return 0;
120}
121
122static void
123iova_insert_rbtree(struct rb_root *root, struct iova *iova)
124{
125 struct rb_node **new = &(root->rb_node), *parent = NULL;
126 /* Figure out where to put new node */
127 while (*new) {
128 struct iova *this = container_of(*new, struct iova, node);
129 parent = *new;
130
131 if (iova->pfn_lo < this->pfn_lo)
132 new = &((*new)->rb_left);
133 else if (iova->pfn_lo > this->pfn_lo)
134 new = &((*new)->rb_right);
135 else
136 BUG(); /* this should not happen */
137 }
138 /* Add new node and rebalance tree. */
139 rb_link_node(&iova->node, parent, new);
140 rb_insert_color(&iova->node, root);
141}
142
143/**
144 * alloc_iova - allocates an iova
145 * @iovad: iova domain in question
146 * @size: number of page frames to allocate
147 * @limit_pfn: max limit address
148 * @size_aligned: set if a size-aligned address range is required
149 * This function allocates an iova in the range IOVA_START_PFN to limit_pfn,
150 * searching down from limit_pfn rather than up from IOVA_START_PFN. If the size_aligned
151 * flag is set then the allocated address iova->pfn_lo will be naturally
152 * aligned on roundup_power_of_two(size).
153 */
154struct iova *
155alloc_iova(struct iova_domain *iovad, unsigned long size,
156 unsigned long limit_pfn,
157 bool size_aligned)
158{
159 unsigned long flags;
160 struct iova *new_iova;
161 int ret;
162
163 new_iova = alloc_iova_mem();
164 if (!new_iova)
165 return NULL;
166
167	/* If size_aligned is set then round the size up
168	 * to the next power of two.
169 */
170 if (size_aligned)
171 size = __roundup_pow_of_two(size);
172
173 spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
174 ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
175 size_aligned);
176
177 if (ret) {
178 spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
179 free_iova_mem(new_iova);
180 return NULL;
181 }
182
183 /* Insert the new_iova into domain rbtree by holding writer lock */
184 spin_lock(&iovad->iova_rbtree_lock);
185 iova_insert_rbtree(&iovad->rbroot, new_iova);
186 __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
187 spin_unlock(&iovad->iova_rbtree_lock);
188
189 spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
190
191 return new_iova;
192}
193
194/**
195 * find_iova - finds an iova for a given pfn
196 * @iovad: iova domain in question.
197 * @pfn: page frame number
198 * This function finds and returns an iova belonging to the
199 * given domain which matches the given pfn.
200 */
201struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
202{
203 unsigned long flags;
204 struct rb_node *node;
205
206 /* Take the lock so that no other thread is manipulating the rbtree */
207 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
208 node = iovad->rbroot.rb_node;
209 while (node) {
210 struct iova *iova = container_of(node, struct iova, node);
211
212 /* If pfn falls within iova's range, return iova */
213 if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
214 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
215 /* We are not holding the lock while this iova
216 * is referenced by the caller as the same thread
217 * which called this function also calls __free_iova()
218		 * and it is by design that only one thread can possibly
219 * reference a particular iova and hence no conflict.
220 */
221 return iova;
222 }
223
224 if (pfn < iova->pfn_lo)
225 node = node->rb_left;
226 else if (pfn > iova->pfn_lo)
227 node = node->rb_right;
228 }
229
230 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
231 return NULL;
232}
233
234/**
235 * __free_iova - frees the given iova
236 * @iovad: iova domain in question.
237 * @iova: iova in question.
238 * Frees the given iova belonging to the given domain
239 */
240void
241__free_iova(struct iova_domain *iovad, struct iova *iova)
242{
243 unsigned long flags;
244
245 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
246 __cached_rbnode_delete_update(iovad, iova);
247 rb_erase(&iova->node, &iovad->rbroot);
248 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
249 free_iova_mem(iova);
250}
251
252/**
253 * free_iova - finds and frees the iova for a given pfn
254 * @iovad: iova domain in question.
255 * @pfn: pfn that was allocated previously
256 * This function finds an iova for a given pfn and then
257 * frees the iova from that domain.
258 */
259void
260free_iova(struct iova_domain *iovad, unsigned long pfn)
261{
262 struct iova *iova = find_iova(iovad, pfn);
263 if (iova)
264 __free_iova(iovad, iova);
265
266}
267
268/**
269 * put_iova_domain - destroys the iova domain
270 * @iovad: iova domain in question.
271 * All the iovas in that domain are destroyed.
272 */
273void put_iova_domain(struct iova_domain *iovad)
274{
275 struct rb_node *node;
276 unsigned long flags;
277
278 spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
279 node = rb_first(&iovad->rbroot);
280 while (node) {
281 struct iova *iova = container_of(node, struct iova, node);
282 rb_erase(node, &iovad->rbroot);
283 free_iova_mem(iova);
284 node = rb_first(&iovad->rbroot);
285 }
286 spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
287}
288
289static int
290__is_range_overlap(struct rb_node *node,
291 unsigned long pfn_lo, unsigned long pfn_hi)
292{
293 struct iova *iova = container_of(node, struct iova, node);
294
295 if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
296 return 1;
297 return 0;
298}
299
300static struct iova *
301__insert_new_range(struct iova_domain *iovad,
302 unsigned long pfn_lo, unsigned long pfn_hi)
303{
304 struct iova *iova;
305
306 iova = alloc_iova_mem();
307 if (!iova)
308 return iova;
309
310 iova->pfn_hi = pfn_hi;
311 iova->pfn_lo = pfn_lo;
312 iova_insert_rbtree(&iovad->rbroot, iova);
313 return iova;
314}
315
316static void
317__adjust_overlap_range(struct iova *iova,
318 unsigned long *pfn_lo, unsigned long *pfn_hi)
319{
320 if (*pfn_lo < iova->pfn_lo)
321 iova->pfn_lo = *pfn_lo;
322 if (*pfn_hi > iova->pfn_hi)
323 *pfn_lo = iova->pfn_hi + 1;
324}
325
326/**
327 * reserve_iova - reserves an iova in the given range
328 * @iovad: iova domain pointer
329 * @pfn_lo: lower page frame address
330 * @pfn_hi: higher pfn address
331 * This function reserves the address range from pfn_lo to pfn_hi so
332 * that this range is not handed out as part of alloc_iova.
333 */
334struct iova *
335reserve_iova(struct iova_domain *iovad,
336 unsigned long pfn_lo, unsigned long pfn_hi)
337{
338 struct rb_node *node;
339 unsigned long flags;
340 struct iova *iova;
341 unsigned int overlap = 0;
342
343 spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
344 spin_lock(&iovad->iova_rbtree_lock);
345 for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
346 if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
347 iova = container_of(node, struct iova, node);
348 __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
349 if ((pfn_lo >= iova->pfn_lo) &&
350 (pfn_hi <= iova->pfn_hi))
351 goto finish;
352 overlap = 1;
353
354 } else if (overlap)
355 break;
356 }
357
358	/* We are here either because this is the first reserved node
359	 * or because we need to insert the remaining non-overlapping addr range
360 */
361 iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
362finish:
363
364 spin_unlock(&iovad->iova_rbtree_lock);
365 spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
366 return iova;
367}
368
369/**
370 * copy_reserved_iova - copies the reserved iovas between domains
371 * @from: source domain to copy from
372 * @to: destination domain to copy to
373 * This function copies reserved iovas from one domain to
374 * the other.
375 */
376void
377copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
378{
379 unsigned long flags;
380 struct rb_node *node;
381
382 spin_lock_irqsave(&from->iova_alloc_lock, flags);
383 spin_lock(&from->iova_rbtree_lock);
384 for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
385 struct iova *iova = container_of(node, struct iova, node);
386 struct iova *new_iova;
387 new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
388 if (!new_iova)
389			printk(KERN_ERR "Reserve iova range %lx-%lx failed\n",
390				iova->pfn_lo, iova->pfn_hi);
391 }
392 spin_unlock(&from->iova_rbtree_lock);
393 spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
394}
diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h
new file mode 100644
index 00000000000..ae3028d5a94
--- /dev/null
+++ b/drivers/pci/iova.h
@@ -0,0 +1,63 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This file is released under the GPLv2.
5 *
6 * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
7 *
8 */
9
10#ifndef _IOVA_H_
11#define _IOVA_H_
12
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/rbtree.h>
16#include <linux/dma-mapping.h>
17
18/*
19 * We need a fixed PAGE_SIZE of 4K irrespective of
20 * arch PAGE_SIZE for IOMMU page tables.
21 */
22#define PAGE_SHIFT_4K (12)
23#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K)
24#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K)
25#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
26
27/* IO virtual address start page frame number */
28#define IOVA_START_PFN (1)
29
30#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K)
31#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
32#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
33
34/* iova structure */
35struct iova {
36 struct rb_node node;
37 unsigned long pfn_hi; /* IOMMU dish out addr hi */
38 unsigned long pfn_lo; /* IOMMU dish out addr lo */
39};
40
41/* holds all the iova translations for a domain */
42struct iova_domain {
43 spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
44 spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
45 struct rb_root rbroot; /* iova domain rbtree root */
46 struct rb_node *cached32_node; /* Save last alloced node */
47};
48
49struct iova *alloc_iova_mem(void);
50void free_iova_mem(struct iova *iova);
51void free_iova(struct iova_domain *iovad, unsigned long pfn);
52void __free_iova(struct iova_domain *iovad, struct iova *iova);
53struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
54 unsigned long limit_pfn,
55 bool size_aligned);
56struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
57 unsigned long pfn_hi);
58void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
59void init_iova_domain(struct iova_domain *iovad);
60struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
61void put_iova_domain(struct iova_domain *iovad);
62
63#endif
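
A kernel-context usage sketch of this API, roughly as intel-iommu.c drives it above; not standalone, since alloc_iova_mem()/free_iova_mem() are supplied by the user of the allocator, and the reserved range here is hypothetical:

struct iova_domain iovad;
struct iova *iova;

init_iova_domain(&iovad);

/* keep a (hypothetical) low range out of circulation */
reserve_iova(&iovad, IOVA_PFN(0), IOVA_PFN(0x100000) - 1);

/* 16 pages below 4GB, naturally aligned because size_aligned is set */
iova = alloc_iova(&iovad, 16, DMA_32BIT_PFN, 1);
if (iova) {
	/* ... program page tables for pfn_lo..pfn_hi ... */
	__free_iova(&iovad, iova);
}

put_iova_domain(&iovad);
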
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6fda33de84e..fc87e14b50d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -90,3 +90,4 @@ pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
90 return NULL; 90 return NULL;
91} 91}
92 92
93struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 5db6b6690b5..463a5a9d583 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -837,6 +837,19 @@ static void pci_release_dev(struct device *dev)
837 kfree(pci_dev); 837 kfree(pci_dev);
838} 838}
839 839
840static void set_pcie_port_type(struct pci_dev *pdev)
841{
842 int pos;
843 u16 reg16;
844
845 pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
846 if (!pos)
847 return;
848 pdev->is_pcie = 1;
849 pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
850 pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
851}
852
840/** 853/**
841 * pci_cfg_space_size - get the configuration space size of the PCI device. 854 * pci_cfg_space_size - get the configuration space size of the PCI device.
842 * @dev: PCI device 855 * @dev: PCI device
@@ -951,6 +964,7 @@ pci_scan_device(struct pci_bus *bus, int devfn)
951 dev->device = (l >> 16) & 0xffff; 964 dev->device = (l >> 16) & 0xffff;
952 dev->cfg_size = pci_cfg_space_size(dev); 965 dev->cfg_size = pci_cfg_space_size(dev);
953 dev->error_state = pci_channel_io_normal; 966 dev->error_state = pci_channel_io_normal;
967 set_pcie_port_type(dev);
954 968
955 /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) 969 /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
956 set this higher, assuming the system even supports it. */ 970 set this higher, assuming the system even supports it. */
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index c6e79d01ce3..b001b5922e3 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -14,6 +14,40 @@
14#include "pci.h" 14#include "pci.h"
15 15
16DECLARE_RWSEM(pci_bus_sem); 16DECLARE_RWSEM(pci_bus_sem);
17/*
18 * find the upstream PCIE-to-PCI bridge of a PCI device
19 * if the device is PCIE, return NULL
20 * if the device isn't connected to a PCIE bridge (that is, its parent is a
21 * legacy PCI bridge and the bridge is directly connected to bus 0), return its
22 * parent
23 */
24struct pci_dev *
25pci_find_upstream_pcie_bridge(struct pci_dev *pdev)
26{
27 struct pci_dev *tmp = NULL;
28
29 if (pdev->is_pcie)
30 return NULL;
31 while (1) {
32 if (!pdev->bus->self)
33 break;
34 pdev = pdev->bus->self;
35 /* a p2p bridge */
36 if (!pdev->is_pcie) {
37 tmp = pdev;
38 continue;
39 }
40 /* PCI device should connect to a PCIE bridge */
41 if (pdev->pcie_type != PCI_EXP_TYPE_PCI_BRIDGE) {
42 /* Busted hardware? */
43 WARN_ON_ONCE(1);
44 return NULL;
45 }
46 return pdev;
47 }
48
49 return tmp;
50}
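
A standalone toy model of the same walk (the pcie_type/WARN_ON_ONCE check is elided; structures are local stand-ins for the pci_dev hierarchy):

#include <stdio.h>

struct dev {
	struct dev *parent;	/* stands in for pdev->bus->self */
	int is_pcie;
};

/* Walk parents of a conventional-PCI device until a PCIe ancestor is
 * reached, remembering the topmost legacy bridge seen on the way. */
static struct dev *find_upstream_pcie_bridge(struct dev *d)
{
	struct dev *tmp = NULL;

	if (d->is_pcie)
		return NULL;		/* PCIe devices need no proxy bridge */
	while (d->parent) {
		d = d->parent;
		if (!d->is_pcie) {	/* another legacy p2p bridge */
			tmp = d;
			continue;
		}
		return d;		/* first PCIe ancestor */
	}
	return tmp;			/* legacy tree all the way up */
}

int main(void)
{
	struct dev root = { NULL, 1 };	/* PCIe-to-PCI bridge (PCIe side) */
	struct dev p2p  = { &root, 0 };	/* legacy p2p bridge below it */
	struct dev leaf = { &p2p, 0 };	/* conventional PCI device */

	printf("upstream bridge is %s\n",
	       find_upstream_pcie_bridge(&leaf) == &root ?
	       "the PCIe ancestor" : "a legacy bridge");
	return 0;
}
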
17 51
18static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr) 52static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr)
19{ 53{
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0a3ee5a322b..5574ba3ab1f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -103,7 +103,7 @@ extern int cifs_ioctl(struct inode *inode, struct file *filep,
103 unsigned int command, unsigned long arg); 103 unsigned int command, unsigned long arg);
104 104
105#ifdef CONFIG_CIFS_EXPERIMENTAL 105#ifdef CONFIG_CIFS_EXPERIMENTAL
106extern struct export_operations cifs_export_ops; 106extern const struct export_operations cifs_export_ops;
107#endif /* EXPERIMENTAL */ 107#endif /* EXPERIMENTAL */
108 108
109#define CIFS_VERSION "1.51" 109#define CIFS_VERSION "1.51"
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index d614b91caec..75949d6a5f1 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -53,7 +53,7 @@ static struct dentry *cifs_get_parent(struct dentry *dentry)
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
56struct export_operations cifs_export_ops = { 56const struct export_operations cifs_export_ops = {
57 .get_parent = cifs_get_parent, 57 .get_parent = cifs_get_parent,
58/* Following five export operations are unneeded so far and can default: 58/* Following five export operations are unneeded so far and can default:
59 .get_dentry = 59 .get_dentry =
diff --git a/fs/dcache.c b/fs/dcache.c
index 2bb3f7ac683..d9ca1e5ceb9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1479,6 +1479,8 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1479 * dentry:internal, target:external. Steal target's 1479 * dentry:internal, target:external. Steal target's
1480 * storage and make target internal. 1480 * storage and make target internal.
1481 */ 1481 */
1482 memcpy(target->d_iname, dentry->d_name.name,
1483 dentry->d_name.len + 1);
1482 dentry->d_name.name = target->d_name.name; 1484 dentry->d_name.name = target->d_name.name;
1483 target->d_name.name = target->d_iname; 1485 target->d_name.name = target->d_iname;
1484 } 1486 }
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 5276b19423c..f7f407075be 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -10,6 +10,8 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/efs_fs.h> 11#include <linux/efs_fs.h>
12#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
13#include <linux/exportfs.h>
14
13 15
14static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { 16static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) {
15 struct buffer_head *bh; 17 struct buffer_head *bh;
@@ -75,13 +77,10 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
75 return NULL; 77 return NULL;
76} 78}
77 79
78struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp) 80static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
81 u32 generation)
79{ 82{
80 __u32 *objp = vobjp;
81 unsigned long ino = objp[0];
82 __u32 generation = objp[1];
83 struct inode *inode; 83 struct inode *inode;
84 struct dentry *result;
85 84
86 if (ino == 0) 85 if (ino == 0)
87 return ERR_PTR(-ESTALE); 86 return ERR_PTR(-ESTALE);
@@ -91,20 +90,25 @@ struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp)
91 90
92 if (is_bad_inode(inode) || 91 if (is_bad_inode(inode) ||
93 (generation && inode->i_generation != generation)) { 92 (generation && inode->i_generation != generation)) {
94 result = ERR_PTR(-ESTALE); 93 iput(inode);
95 goto out_iput; 94 return ERR_PTR(-ESTALE);
96 } 95 }
97 96
98 result = d_alloc_anon(inode); 97 return inode;
99 if (!result) { 98}
100 result = ERR_PTR(-ENOMEM);
101 goto out_iput;
102 }
103 return result;
104 99
105 out_iput: 100struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid,
106 iput(inode); 101 int fh_len, int fh_type)
107 return result; 102{
103 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
104 efs_nfs_get_inode);
105}
106
107struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
108 int fh_len, int fh_type)
109{
110 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
111 efs_nfs_get_inode);
108} 112}
109 113
110struct dentry *efs_get_parent(struct dentry *child) 114struct dentry *efs_get_parent(struct dentry *child)
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 25d0326c5f1..c79bc627f10 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -113,8 +113,9 @@ static const struct super_operations efs_superblock_operations = {
113 .remount_fs = efs_remount, 113 .remount_fs = efs_remount,
114}; 114};
115 115
116static struct export_operations efs_export_ops = { 116static const struct export_operations efs_export_ops = {
117 .get_dentry = efs_get_dentry, 117 .fh_to_dentry = efs_fh_to_dentry,
118 .fh_to_parent = efs_fh_to_parent,
118 .get_parent = efs_get_parent, 119 .get_parent = efs_get_parent,
119}; 120};
120 121
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 8adb32a9387..109ab5e44ec 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,4 +1,13 @@
1 1/*
2 * Copyright (C) Neil Brown 2002
3 * Copyright (C) Christoph Hellwig 2007
4 *
5 * This file contains the code mapping from inodes to NFS file handles,
6 * and for mapping back from file handles to dentries.
7 *
8 * For details on why we do all the strange and hairy things in here
9 * take a look at Documentation/filesystems/Exporting.
10 */
2#include <linux/exportfs.h> 11#include <linux/exportfs.h>
3#include <linux/fs.h> 12#include <linux/fs.h>
4#include <linux/file.h> 13#include <linux/file.h>
@@ -9,32 +18,19 @@
9#define dprintk(fmt, args...) do{}while(0) 18#define dprintk(fmt, args...) do{}while(0)
10 19
11 20
12static int get_name(struct dentry *dentry, char *name, 21static int get_name(struct vfsmount *mnt, struct dentry *dentry, char *name,
13 struct dentry *child); 22 struct dentry *child);
14 23
15 24
16static struct dentry *exportfs_get_dentry(struct super_block *sb, void *obj) 25static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
26 char *name, struct dentry *child)
17{ 27{
18 struct dentry *result = ERR_PTR(-ESTALE); 28 const struct export_operations *nop = dir->d_sb->s_export_op;
19
20 if (sb->s_export_op->get_dentry) {
21 result = sb->s_export_op->get_dentry(sb, obj);
22 if (!result)
23 result = ERR_PTR(-ESTALE);
24 }
25
26 return result;
27}
28
29static int exportfs_get_name(struct dentry *dir, char *name,
30 struct dentry *child)
31{
32 struct export_operations *nop = dir->d_sb->s_export_op;
33 29
34 if (nop->get_name) 30 if (nop->get_name)
35 return nop->get_name(dir, name, child); 31 return nop->get_name(dir, name, child);
36 else 32 else
37 return get_name(dir, name, child); 33 return get_name(mnt, dir, name, child);
38} 34}
39 35
40/* 36/*
@@ -98,7 +94,7 @@ find_disconnected_root(struct dentry *dentry)
98 * It may already be, as the flag isn't always updated when connection happens. 94 * It may already be, as the flag isn't always updated when connection happens.
99 */ 95 */
100static int 96static int
101reconnect_path(struct super_block *sb, struct dentry *target_dir) 97reconnect_path(struct vfsmount *mnt, struct dentry *target_dir)
102{ 98{
103 char nbuf[NAME_MAX+1]; 99 char nbuf[NAME_MAX+1];
104 int noprogress = 0; 100 int noprogress = 0;
@@ -121,7 +117,7 @@ reconnect_path(struct super_block *sb, struct dentry *target_dir)
121 pd->d_flags &= ~DCACHE_DISCONNECTED; 117 pd->d_flags &= ~DCACHE_DISCONNECTED;
122 spin_unlock(&pd->d_lock); 118 spin_unlock(&pd->d_lock);
123 noprogress = 0; 119 noprogress = 0;
124 } else if (pd == sb->s_root) { 120 } else if (pd == mnt->mnt_sb->s_root) {
125 printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n"); 121 printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n");
126 spin_lock(&pd->d_lock); 122 spin_lock(&pd->d_lock);
127 pd->d_flags &= ~DCACHE_DISCONNECTED; 123 pd->d_flags &= ~DCACHE_DISCONNECTED;
@@ -147,8 +143,8 @@ reconnect_path(struct super_block *sb, struct dentry *target_dir)
147 struct dentry *npd; 143 struct dentry *npd;
148 144
149 mutex_lock(&pd->d_inode->i_mutex); 145 mutex_lock(&pd->d_inode->i_mutex);
150 if (sb->s_export_op->get_parent) 146 if (mnt->mnt_sb->s_export_op->get_parent)
151 ppd = sb->s_export_op->get_parent(pd); 147 ppd = mnt->mnt_sb->s_export_op->get_parent(pd);
152 mutex_unlock(&pd->d_inode->i_mutex); 148 mutex_unlock(&pd->d_inode->i_mutex);
153 149
154 if (IS_ERR(ppd)) { 150 if (IS_ERR(ppd)) {
@@ -161,7 +157,7 @@ reconnect_path(struct super_block *sb, struct dentry *target_dir)
161 157
162 dprintk("%s: find name of %lu in %lu\n", __FUNCTION__, 158 dprintk("%s: find name of %lu in %lu\n", __FUNCTION__,
163 pd->d_inode->i_ino, ppd->d_inode->i_ino); 159 pd->d_inode->i_ino, ppd->d_inode->i_ino);
164 err = exportfs_get_name(ppd, nbuf, pd); 160 err = exportfs_get_name(mnt, ppd, nbuf, pd);
165 if (err) { 161 if (err) {
166 dput(ppd); 162 dput(ppd);
167 dput(pd); 163 dput(pd);
@@ -214,125 +210,6 @@ reconnect_path(struct super_block *sb, struct dentry *target_dir)
214 return 0; 210 return 0;
215} 211}
216 212
217/**
218 * find_exported_dentry - helper routine to implement export_operations->decode_fh
219 * @sb: The &super_block identifying the filesystem
220 * @obj: An opaque identifier of the object to be found - passed to
221 * get_inode
222 * @parent: An optional opqaue identifier of the parent of the object.
223 * @acceptable: A function used to test possible &dentries to see if they are
224 * acceptable
225 * @context: A parameter to @acceptable so that it knows on what basis to
226 * judge.
227 *
228 * find_exported_dentry is the central helper routine to enable file systems
229 * to provide the decode_fh() export_operation. It's main task is to take
230 * an &inode, find or create an appropriate &dentry structure, and possibly
231 * splice this into the dcache in the correct place.
232 *
233 * The decode_fh() operation provided by the filesystem should call
234 * find_exported_dentry() with the same parameters that it received except
235 * that instead of the file handle fragment, pointers to opaque identifiers
236 * for the object and optionally its parent are passed. The default decode_fh
237 * routine passes one pointer to the start of the filehandle fragment, and
238 * one 8 bytes into the fragment. It is expected that most filesystems will
239 * take this approach, though the offset to the parent identifier may well be
240 * different.
241 *
242 * find_exported_dentry() will call get_dentry to get an dentry pointer from
243 * the file system. If any &dentry in the d_alias list is acceptable, it will
244 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
245 * &dentry into the dcache using get_name() and get_parent() to find the
246 * appropriate place.
247 */
248
249struct dentry *
250find_exported_dentry(struct super_block *sb, void *obj, void *parent,
251 int (*acceptable)(void *context, struct dentry *de),
252 void *context)
253{
254 struct dentry *result, *alias;
255 int err = -ESTALE;
256
257 /*
258 * Attempt to find the inode.
259 */
260 result = exportfs_get_dentry(sb, obj);
261 if (IS_ERR(result))
262 return result;
263
264 if (S_ISDIR(result->d_inode->i_mode)) {
265 if (!(result->d_flags & DCACHE_DISCONNECTED)) {
266 if (acceptable(context, result))
267 return result;
268 err = -EACCES;
269 goto err_result;
270 }
271
272 err = reconnect_path(sb, result);
273 if (err)
274 goto err_result;
275 } else {
276 struct dentry *target_dir, *nresult;
277 char nbuf[NAME_MAX+1];
278
279 alias = find_acceptable_alias(result, acceptable, context);
280 if (alias)
281 return alias;
282
283 if (parent == NULL)
284 goto err_result;
285
286 target_dir = exportfs_get_dentry(sb,parent);
287 if (IS_ERR(target_dir)) {
288 err = PTR_ERR(target_dir);
289 goto err_result;
290 }
291
292 err = reconnect_path(sb, target_dir);
293 if (err) {
294 dput(target_dir);
295 goto err_result;
296 }
297
298 /*
299 * As we weren't after a directory, have one more step to go.
300 */
301 err = exportfs_get_name(target_dir, nbuf, result);
302 if (!err) {
303 mutex_lock(&target_dir->d_inode->i_mutex);
304 nresult = lookup_one_len(nbuf, target_dir,
305 strlen(nbuf));
306 mutex_unlock(&target_dir->d_inode->i_mutex);
307 if (!IS_ERR(nresult)) {
308 if (nresult->d_inode) {
309 dput(result);
310 result = nresult;
311 } else
312 dput(nresult);
313 }
314 }
315 dput(target_dir);
316 }
317
318 alias = find_acceptable_alias(result, acceptable, context);
319 if (alias)
320 return alias;
321
322 /* drat - I just cannot find anything acceptable */
323 dput(result);
324 /* It might be justifiable to return ESTALE here,
325 * but the filehandle at-least looks reasonable good
326 * and it may just be a permission problem, so returning
327 * -EACCESS is safer
328 */
329 return ERR_PTR(-EACCES);
330
331 err_result:
332 dput(result);
333 return ERR_PTR(err);
334}
335
336struct getdents_callback { 213struct getdents_callback {
337 char *name; /* name that was found. It already points to a 214 char *name; /* name that was found. It already points to a
338 buffer NAME_MAX+1 is size */ 215 buffer NAME_MAX+1 is size */
@@ -370,8 +247,8 @@ static int filldir_one(void * __buf, const char * name, int len,
370 * calls readdir on the parent until it finds an entry with 247 * calls readdir on the parent until it finds an entry with
371 * the same inode number as the child, and returns that. 248 * the same inode number as the child, and returns that.
372 */ 249 */
373static int get_name(struct dentry *dentry, char *name, 250static int get_name(struct vfsmount *mnt, struct dentry *dentry,
374 struct dentry *child) 251 char *name, struct dentry *child)
375{ 252{
376 struct inode *dir = dentry->d_inode; 253 struct inode *dir = dentry->d_inode;
377 int error; 254 int error;
@@ -387,7 +264,7 @@ static int get_name(struct dentry *dentry, char *name,
387 /* 264 /*
388 * Open the directory ... 265 * Open the directory ...
389 */ 266 */
390 file = dentry_open(dget(dentry), NULL, O_RDONLY); 267 file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY);
391 error = PTR_ERR(file); 268 error = PTR_ERR(file);
392 if (IS_ERR(file)) 269 if (IS_ERR(file))
393 goto out; 270 goto out;
@@ -434,100 +311,177 @@ out:
434 * can be used to check that it is still valid. It places them in the 311 * can be used to check that it is still valid. It places them in the
435 * filehandle fragment where export_decode_fh expects to find them. 312 * filehandle fragment where export_decode_fh expects to find them.
436 */ 313 */
437static int export_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, 314static int export_encode_fh(struct dentry *dentry, struct fid *fid,
438 int connectable) 315 int *max_len, int connectable)
439{ 316{
440 struct inode * inode = dentry->d_inode; 317 struct inode * inode = dentry->d_inode;
441 int len = *max_len; 318 int len = *max_len;
442 int type = 1; 319 int type = FILEID_INO32_GEN;
443 320
444 if (len < 2 || (connectable && len < 4)) 321 if (len < 2 || (connectable && len < 4))
445 return 255; 322 return 255;
446 323
447 len = 2; 324 len = 2;
448 fh[0] = inode->i_ino; 325 fid->i32.ino = inode->i_ino;
449 fh[1] = inode->i_generation; 326 fid->i32.gen = inode->i_generation;
450 if (connectable && !S_ISDIR(inode->i_mode)) { 327 if (connectable && !S_ISDIR(inode->i_mode)) {
451 struct inode *parent; 328 struct inode *parent;
452 329
453 spin_lock(&dentry->d_lock); 330 spin_lock(&dentry->d_lock);
454 parent = dentry->d_parent->d_inode; 331 parent = dentry->d_parent->d_inode;
455 fh[2] = parent->i_ino; 332 fid->i32.parent_ino = parent->i_ino;
456 fh[3] = parent->i_generation; 333 fid->i32.parent_gen = parent->i_generation;
457 spin_unlock(&dentry->d_lock); 334 spin_unlock(&dentry->d_lock);
458 len = 4; 335 len = 4;
459 type = 2; 336 type = FILEID_INO32_GEN_PARENT;
460 } 337 }
461 *max_len = len; 338 *max_len = len;
462 return type; 339 return type;
463} 340}
464 341
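
A standalone model of the fid layout this now produces: two 32-bit words for FILEID_INO32_GEN, four for FILEID_INO32_GEN_PARENT (types 1 and 2, matching the old literals). Note the real function also skips the parent words for directories, which this sketch omits:

#include <stdint.h>
#include <stdio.h>

enum { FILEID_INO32_GEN = 1, FILEID_INO32_GEN_PARENT = 2 };

struct fid32 { uint32_t ino, gen, parent_ino, parent_gen; };

static int encode(struct fid32 *fid, uint32_t ino, uint32_t gen,
		  int connectable, uint32_t pino, uint32_t pgen)
{
	fid->ino = ino;
	fid->gen = gen;
	if (!connectable)
		return FILEID_INO32_GEN;	/* 2-word handle */
	fid->parent_ino = pino;
	fid->parent_gen = pgen;
	return FILEID_INO32_GEN_PARENT;		/* 4-word handle */
}

int main(void)
{
	struct fid32 fid = { 0, 0, 0, 0 };
	int type = encode(&fid, 1234, 7, 1, 56, 9);

	printf("type=%d ino=%u gen=%u parent=%u gen=%u\n",
	       type, fid.ino, fid.gen, fid.parent_ino, fid.parent_gen);
	return 0;
}
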
465 342int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
466/**
467 * export_decode_fh - default export_operations->decode_fh function
468 * @sb: The superblock
469 * @fh: pointer to the file handle fragment
470 * @fh_len: length of file handle fragment
471 * @acceptable: function for testing acceptability of dentrys
472 * @context: context for @acceptable
473 *
474 * This is the default decode_fh() function.
475 * a fileid_type of 1 indicates that the filehandlefragment
476 * just contains an object identifier understood by get_dentry.
477 * a fileid_type of 2 says that there is also a directory
478 * identifier 8 bytes in to the filehandlefragement.
479 */
480static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh_len,
481 int fileid_type,
482 int (*acceptable)(void *context, struct dentry *de),
483 void *context)
484{
485 __u32 parent[2];
486 parent[0] = parent[1] = 0;
487 if (fh_len < 2 || fileid_type > 2)
488 return NULL;
489 if (fileid_type == 2) {
490 if (fh_len > 2) parent[0] = fh[2];
491 if (fh_len > 3) parent[1] = fh[3];
492 }
493 return find_exported_dentry(sb, fh, parent,
494 acceptable, context);
495}
496
497int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
498 int connectable) 343 int connectable)
499{ 344{
500 struct export_operations *nop = dentry->d_sb->s_export_op; 345 const struct export_operations *nop = dentry->d_sb->s_export_op;
501 int error; 346 int error;
502 347
503 if (nop->encode_fh) 348 if (nop->encode_fh)
504 error = nop->encode_fh(dentry, fh, max_len, connectable); 349 error = nop->encode_fh(dentry, fid->raw, max_len, connectable);
505 else 350 else
506 error = export_encode_fh(dentry, fh, max_len, connectable); 351 error = export_encode_fh(dentry, fid, max_len, connectable);
507 352
508 return error; 353 return error;
509} 354}
510EXPORT_SYMBOL_GPL(exportfs_encode_fh); 355EXPORT_SYMBOL_GPL(exportfs_encode_fh);
511 356
512struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len, 357struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
513 int fileid_type, int (*acceptable)(void *, struct dentry *), 358 int fh_len, int fileid_type,
514 void *context) 359 int (*acceptable)(void *, struct dentry *), void *context)
515{ 360{
516 struct export_operations *nop = mnt->mnt_sb->s_export_op; 361 const struct export_operations *nop = mnt->mnt_sb->s_export_op;
517 struct dentry *result; 362 struct dentry *result, *alias;
363 int err;
518 364
519 if (nop->decode_fh) { 365 /*
520 result = nop->decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type, 366 * Try to get any dentry for the given file handle from the filesystem.
521 acceptable, context); 367 */
368 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
369 if (!result)
370 result = ERR_PTR(-ESTALE);
371 if (IS_ERR(result))
372 return result;
373
374 if (S_ISDIR(result->d_inode->i_mode)) {
375 /*
376 * This request is for a directory.
377 *
378 * On the positive side there is only one dentry for each
379 * directory inode. On the negative side this implies that we
380 * to ensure our dentry is connected all the way up to the
381 * filesystem root.
382 */
383 if (result->d_flags & DCACHE_DISCONNECTED) {
384 err = reconnect_path(mnt, result);
385 if (err)
386 goto err_result;
387 }
388
389 if (!acceptable(context, result)) {
390 err = -EACCES;
391 goto err_result;
392 }
393
394 return result;
522 } else { 395 } else {
523 result = export_decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type, 396 /*
524 acceptable, context); 397 * It's not a directory. Life is a little more complicated.
398 */
399 struct dentry *target_dir, *nresult;
400 char nbuf[NAME_MAX+1];
401
402 /*
403 * See if either the dentry we just got from the filesystem
404 * or any alias for it is acceptable. This is always true
405 * if this filesystem is exported without the subtree_check
406 * option. If the filesystem is exported with the subtree_check
407 * option there's a fair chance we need to look at
408 * the parent directory in the file handle and make sure
409 * it's connected to the filesystem root.
410 */
411 alias = find_acceptable_alias(result, acceptable, context);
412 if (alias)
413 return alias;
414
415 /*
416 * Try to extract a dentry for the parent directory from the
417 * file handle. If this fails we'll have to give up.
418 */
419 err = -ESTALE;
420 if (!nop->fh_to_parent)
421 goto err_result;
422
423 target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
424 fh_len, fileid_type);
425 if (!target_dir)
426 goto err_result;
427 err = PTR_ERR(target_dir);
428 if (IS_ERR(target_dir))
429 goto err_result;
430
431 /*
432 * And as usual we need to make sure the parent directory is
433 * connected to the filesystem root. The VFS really doesn't
434 * like disconnected directories..
435 */
436 err = reconnect_path(mnt, target_dir);
437 if (err) {
438 dput(target_dir);
439 goto err_result;
440 }
441
442 /*
443 * Now that we've got both a well-connected parent and a
444 * dentry for the inode we're after, make sure that our
445 * inode is actually connected to the parent.
446 */
447 err = exportfs_get_name(mnt, target_dir, nbuf, result);
448 if (!err) {
449 mutex_lock(&target_dir->d_inode->i_mutex);
450 nresult = lookup_one_len(nbuf, target_dir,
451 strlen(nbuf));
452 mutex_unlock(&target_dir->d_inode->i_mutex);
453 if (!IS_ERR(nresult)) {
454 if (nresult->d_inode) {
455 dput(result);
456 result = nresult;
457 } else
458 dput(nresult);
459 }
460 }
461
462 /*
463 * At this point we are done with the parent, but it's pinned
464 * by the child dentry anyway.
465 */
466 dput(target_dir);
467
468 /*
469 * And finally make sure the dentry is actually acceptable
470 * to NFSD.
471 */
472 alias = find_acceptable_alias(result, acceptable, context);
473 if (!alias) {
474 err = -EACCES;
475 goto err_result;
476 }
477
478 return alias;
525 } 479 }
526 480
527 return result; 481 err_result:
482 dput(result);
483 return ERR_PTR(err);
528} 484}
529EXPORT_SYMBOL_GPL(exportfs_decode_fh); 485EXPORT_SYMBOL_GPL(exportfs_decode_fh);
530 486
531EXPORT_SYMBOL(find_exported_dentry);
532
533MODULE_LICENSE("GPL"); 487MODULE_LICENSE("GPL");
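
The hunks above use struct fid and the FILEID_* constants without showing their definitions, which live in include/linux/exportfs.h (also changed by this series but outside this excerpt). A minimal sketch of their shape, inferred from the accesses in this file (fid->i32.ino, fid->raw, the type values 0/1/2 they replace) rather than quoted from the header:

    enum fid_type {
        FILEID_ROOT = 0,              /* root handle, no fileid payload */
        FILEID_INO32_GEN = 1,         /* 32-bit inode number + generation */
        FILEID_INO32_GEN_PARENT = 2,  /* as above, plus parent ino + generation */
    };

    struct fid {
        union {
            struct {
                u32 ino;
                u32 gen;
                u32 parent_ino;
                u32 parent_gen;
            } i32;
            __u32 raw[6];  /* assumed size; must hold the six-word handles below */
        };
    };

Filesystems whose handles do not fit the i32 layout (isofs, ocfs2, reiserfs, xfs below) reinterpret fid->raw instead.
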
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 05d9342bb64..d868e26c15e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,6 +28,24 @@
28 28
29typedef struct ext2_dir_entry_2 ext2_dirent; 29typedef struct ext2_dir_entry_2 ext2_dirent;
30 30
31static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
32{
33 unsigned len = le16_to_cpu(dlen);
34
35 if (len == EXT2_MAX_REC_LEN)
36 return 1 << 16;
37 return len;
38}
39
40static inline __le16 ext2_rec_len_to_disk(unsigned len)
41{
42 if (len == (1 << 16))
43 return cpu_to_le16(EXT2_MAX_REC_LEN);
44 else if (len > (1 << 16))
45 BUG();
46 return cpu_to_le16(len);
47}
48
31/* 49/*
32 * ext2 uses block-sized chunks. Arguably, sector-sized ones would be 50 * ext2 uses block-sized chunks. Arguably, sector-sized ones would be
33 * more robust, but we have what we have 51 * more robust, but we have what we have
@@ -106,7 +124,7 @@ static void ext2_check_page(struct page *page)
106 } 124 }
107 for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) { 125 for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
108 p = (ext2_dirent *)(kaddr + offs); 126 p = (ext2_dirent *)(kaddr + offs);
109 rec_len = le16_to_cpu(p->rec_len); 127 rec_len = ext2_rec_len_from_disk(p->rec_len);
110 128
111 if (rec_len < EXT2_DIR_REC_LEN(1)) 129 if (rec_len < EXT2_DIR_REC_LEN(1))
112 goto Eshort; 130 goto Eshort;
@@ -204,7 +222,8 @@ static inline int ext2_match (int len, const char * const name,
204 */ 222 */
205static inline ext2_dirent *ext2_next_entry(ext2_dirent *p) 223static inline ext2_dirent *ext2_next_entry(ext2_dirent *p)
206{ 224{
207 return (ext2_dirent *)((char*)p + le16_to_cpu(p->rec_len)); 225 return (ext2_dirent *)((char *)p +
226 ext2_rec_len_from_disk(p->rec_len));
208} 227}
209 228
210static inline unsigned 229static inline unsigned
@@ -316,7 +335,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
316 return 0; 335 return 0;
317 } 336 }
318 } 337 }
319 filp->f_pos += le16_to_cpu(de->rec_len); 338 filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
320 } 339 }
321 ext2_put_page(page); 340 ext2_put_page(page);
322 } 341 }
@@ -425,7 +444,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
425{ 444{
426 loff_t pos = page_offset(page) + 445 loff_t pos = page_offset(page) +
427 (char *) de - (char *) page_address(page); 446 (char *) de - (char *) page_address(page);
428 unsigned len = le16_to_cpu(de->rec_len); 447 unsigned len = ext2_rec_len_from_disk(de->rec_len);
429 int err; 448 int err;
430 449
431 lock_page(page); 450 lock_page(page);
@@ -482,7 +501,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
482 /* We hit i_size */ 501 /* We hit i_size */
483 name_len = 0; 502 name_len = 0;
484 rec_len = chunk_size; 503 rec_len = chunk_size;
485 de->rec_len = cpu_to_le16(chunk_size); 504 de->rec_len = ext2_rec_len_to_disk(chunk_size);
486 de->inode = 0; 505 de->inode = 0;
487 goto got_it; 506 goto got_it;
488 } 507 }
@@ -496,7 +515,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
496 if (ext2_match (namelen, name, de)) 515 if (ext2_match (namelen, name, de))
497 goto out_unlock; 516 goto out_unlock;
498 name_len = EXT2_DIR_REC_LEN(de->name_len); 517 name_len = EXT2_DIR_REC_LEN(de->name_len);
499 rec_len = le16_to_cpu(de->rec_len); 518 rec_len = ext2_rec_len_from_disk(de->rec_len);
500 if (!de->inode && rec_len >= reclen) 519 if (!de->inode && rec_len >= reclen)
501 goto got_it; 520 goto got_it;
502 if (rec_len >= name_len + reclen) 521 if (rec_len >= name_len + reclen)
@@ -518,8 +537,8 @@ got_it:
518 goto out_unlock; 537 goto out_unlock;
519 if (de->inode) { 538 if (de->inode) {
520 ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len); 539 ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
521 de1->rec_len = cpu_to_le16(rec_len - name_len); 540 de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
522 de->rec_len = cpu_to_le16(name_len); 541 de->rec_len = ext2_rec_len_to_disk(name_len);
523 de = de1; 542 de = de1;
524 } 543 }
525 de->name_len = namelen; 544 de->name_len = namelen;
@@ -550,7 +569,8 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
550 struct inode *inode = mapping->host; 569 struct inode *inode = mapping->host;
551 char *kaddr = page_address(page); 570 char *kaddr = page_address(page);
552 unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); 571 unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
553 unsigned to = ((char*)dir - kaddr) + le16_to_cpu(dir->rec_len); 572 unsigned to = ((char *)dir - kaddr) +
573 ext2_rec_len_from_disk(dir->rec_len);
554 loff_t pos; 574 loff_t pos;
555 ext2_dirent * pde = NULL; 575 ext2_dirent * pde = NULL;
556 ext2_dirent * de = (ext2_dirent *) (kaddr + from); 576 ext2_dirent * de = (ext2_dirent *) (kaddr + from);
@@ -574,7 +594,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
574 &page, NULL); 594 &page, NULL);
575 BUG_ON(err); 595 BUG_ON(err);
576 if (pde) 596 if (pde)
577 pde->rec_len = cpu_to_le16(to - from); 597 pde->rec_len = ext2_rec_len_to_disk(to - from);
578 dir->inode = 0; 598 dir->inode = 0;
579 err = ext2_commit_chunk(page, pos, to - from); 599 err = ext2_commit_chunk(page, pos, to - from);
580 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 600 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
@@ -610,14 +630,14 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
610 memset(kaddr, 0, chunk_size); 630 memset(kaddr, 0, chunk_size);
611 de = (struct ext2_dir_entry_2 *)kaddr; 631 de = (struct ext2_dir_entry_2 *)kaddr;
612 de->name_len = 1; 632 de->name_len = 1;
613 de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); 633 de->rec_len = ext2_rec_len_to_disk(EXT2_DIR_REC_LEN(1));
614 memcpy (de->name, ".\0\0", 4); 634 memcpy (de->name, ".\0\0", 4);
615 de->inode = cpu_to_le32(inode->i_ino); 635 de->inode = cpu_to_le32(inode->i_ino);
616 ext2_set_de_type (de, inode); 636 ext2_set_de_type (de, inode);
617 637
618 de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1)); 638 de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1));
619 de->name_len = 2; 639 de->name_len = 2;
620 de->rec_len = cpu_to_le16(chunk_size - EXT2_DIR_REC_LEN(1)); 640 de->rec_len = ext2_rec_len_to_disk(chunk_size - EXT2_DIR_REC_LEN(1));
621 de->inode = cpu_to_le32(parent->i_ino); 641 de->inode = cpu_to_le32(parent->i_ino);
622 memcpy (de->name, "..\0", 4); 642 memcpy (de->name, "..\0", 4);
623 ext2_set_de_type (de, inode); 643 ext2_set_de_type (de, inode);
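
The new rec_len helpers above exist because an ext2 directory entry stores its record length in a 16-bit field, so the largest storable value is 65535; on a filesystem with 64KiB blocks a single entry can span a whole block, which is 65536 bytes and does not fit. Since record lengths are four-byte aligned, 65535 can never occur as a real length, which is what makes EXT2_MAX_REC_LEN (defined alongside these helpers, presumably as (1 << 16) - 1) safe as an on-disk sentinel. A worked round-trip under that assumption:

    /* Sketch, assuming EXT2_MAX_REC_LEN == (1 << 16) - 1 == 65535. */
    ext2_rec_len_to_disk(1 << 16);               /* stores the sentinel cpu_to_le16(65535) */
    ext2_rec_len_from_disk(cpu_to_le16(65535));  /* yields 65536 again */
    ext2_rec_len_to_disk(4096);                  /* ordinary lengths round-trip unchanged */

Anything strictly greater than 65536 indicates corruption and trips the BUG() in ext2_rec_len_to_disk().
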
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 77bd5f9262f..154e25f13d7 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -311,13 +311,10 @@ static const struct super_operations ext2_sops = {
311#endif 311#endif
312}; 312};
313 313
314static struct dentry *ext2_get_dentry(struct super_block *sb, void *vobjp) 314static struct inode *ext2_nfs_get_inode(struct super_block *sb,
315 u64 ino, u32 generation)
315{ 316{
316 __u32 *objp = vobjp;
317 unsigned long ino = objp[0];
318 __u32 generation = objp[1];
319 struct inode *inode; 317 struct inode *inode;
320 struct dentry *result;
321 318
322 if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO) 319 if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO)
323 return ERR_PTR(-ESTALE); 320 return ERR_PTR(-ESTALE);
@@ -338,15 +335,21 @@ static struct dentry *ext2_get_dentry(struct super_block *sb, void *vobjp)
338 iput(inode); 335 iput(inode);
339 return ERR_PTR(-ESTALE); 336 return ERR_PTR(-ESTALE);
340 } 337 }
341 /* now to find a dentry. 338 return inode;
342 * If possible, get a well-connected one 339}
343 */ 340
344 result = d_alloc_anon(inode); 341static struct dentry *ext2_fh_to_dentry(struct super_block *sb, struct fid *fid,
345 if (!result) { 342 int fh_len, int fh_type)
346 iput(inode); 343{
347 return ERR_PTR(-ENOMEM); 344 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
348 } 345 ext2_nfs_get_inode);
349 return result; 346}
347
348static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
349 int fh_len, int fh_type)
350{
351 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
352 ext2_nfs_get_inode);
350} 353}
351 354
352/* Yes, most of these are left as NULL!! 355/* Yes, most of these are left as NULL!!
@@ -354,9 +357,10 @@ static struct dentry *ext2_get_dentry(struct super_block *sb, void *vobjp)
354 * systems, but can be improved upon. 357 * systems, but can be improved upon.
355 * Currently only get_parent is required. 358 * Currently only get_parent is required.
356 */ 359 */
357static struct export_operations ext2_export_ops = { 360static const struct export_operations ext2_export_ops = {
361 .fh_to_dentry = ext2_fh_to_dentry,
362 .fh_to_parent = ext2_fh_to_parent,
358 .get_parent = ext2_get_parent, 363 .get_parent = ext2_get_parent,
359 .get_dentry = ext2_get_dentry,
360}; 364};
361 365
362static unsigned long get_sb_block(void **data) 366static unsigned long get_sb_block(void **data)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 81868c0bc40..de55da9e28b 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -631,13 +631,10 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
631} 631}
632 632
633 633
634static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp) 634static struct inode *ext3_nfs_get_inode(struct super_block *sb,
635 u64 ino, u32 generation)
635{ 636{
636 __u32 *objp = vobjp;
637 unsigned long ino = objp[0];
638 __u32 generation = objp[1];
639 struct inode *inode; 637 struct inode *inode;
640 struct dentry *result;
641 638
642 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) 639 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
643 return ERR_PTR(-ESTALE); 640 return ERR_PTR(-ESTALE);
@@ -660,15 +657,22 @@ static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp)
660 iput(inode); 657 iput(inode);
661 return ERR_PTR(-ESTALE); 658 return ERR_PTR(-ESTALE);
662 } 659 }
663 /* now to find a dentry. 660
664 * If possible, get a well-connected one 661 return inode;
665 */ 662}
666 result = d_alloc_anon(inode); 663
667 if (!result) { 664static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
668 iput(inode); 665 int fh_len, int fh_type)
669 return ERR_PTR(-ENOMEM); 666{
670 } 667 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
671 return result; 668 ext3_nfs_get_inode);
669}
670
671static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
672 int fh_len, int fh_type)
673{
674 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
675 ext3_nfs_get_inode);
672} 676}
673 677
674#ifdef CONFIG_QUOTA 678#ifdef CONFIG_QUOTA
@@ -737,9 +741,10 @@ static const struct super_operations ext3_sops = {
737#endif 741#endif
738}; 742};
739 743
740static struct export_operations ext3_export_ops = { 744static const struct export_operations ext3_export_ops = {
745 .fh_to_dentry = ext3_fh_to_dentry,
746 .fh_to_parent = ext3_fh_to_parent,
741 .get_parent = ext3_get_parent, 747 .get_parent = ext3_get_parent,
742 .get_dentry = ext3_get_dentry,
743}; 748};
744 749
745enum { 750enum {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b11e9e2bcd0..8031dc0e24e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -686,13 +686,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
686} 686}
687 687
688 688
689static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp) 689static struct inode *ext4_nfs_get_inode(struct super_block *sb,
690 u64 ino, u32 generation)
690{ 691{
691 __u32 *objp = vobjp;
692 unsigned long ino = objp[0];
693 __u32 generation = objp[1];
694 struct inode *inode; 692 struct inode *inode;
695 struct dentry *result;
696 693
697 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) 694 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
698 return ERR_PTR(-ESTALE); 695 return ERR_PTR(-ESTALE);
@@ -715,15 +712,22 @@ static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
715 iput(inode); 712 iput(inode);
716 return ERR_PTR(-ESTALE); 713 return ERR_PTR(-ESTALE);
717 } 714 }
718 /* now to find a dentry. 715
719 * If possible, get a well-connected one 716 return inode;
720 */ 717}
721 result = d_alloc_anon(inode); 718
722 if (!result) { 719static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
723 iput(inode); 720 int fh_len, int fh_type)
724 return ERR_PTR(-ENOMEM); 721{
725 } 722 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
726 return result; 723 ext4_nfs_get_inode);
724}
725
726static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
727 int fh_len, int fh_type)
728{
729 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
730 ext4_nfs_get_inode);
727} 731}
728 732
729#ifdef CONFIG_QUOTA 733#ifdef CONFIG_QUOTA
@@ -792,9 +796,10 @@ static const struct super_operations ext4_sops = {
792#endif 796#endif
793}; 797};
794 798
795static struct export_operations ext4_export_ops = { 799static const struct export_operations ext4_export_ops = {
800 .fh_to_dentry = ext4_fh_to_dentry,
801 .fh_to_parent = ext4_fh_to_parent,
796 .get_parent = ext4_get_parent, 802 .get_parent = ext4_get_parent,
797 .get_dentry = ext4_get_dentry,
798}; 803};
799 804
800enum { 805enum {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c0c5e9c55b5..920a576e1c2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -653,24 +653,15 @@ static const struct super_operations fat_sops = {
653 * of i_logstart is used to store the directory entry offset. 653 * of i_logstart is used to store the directory entry offset.
654 */ 654 */
655 655
656static struct dentry * 656static struct dentry *fat_fh_to_dentry(struct super_block *sb,
657fat_decode_fh(struct super_block *sb, __u32 *fh, int len, int fhtype, 657 struct fid *fid, int fh_len, int fh_type)
658 int (*acceptable)(void *context, struct dentry *de),
659 void *context)
660{
661 if (fhtype != 3)
662 return ERR_PTR(-ESTALE);
663 if (len < 5)
664 return ERR_PTR(-ESTALE);
665
666 return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable, context);
667}
668
669static struct dentry *fat_get_dentry(struct super_block *sb, void *inump)
670{ 658{
671 struct inode *inode = NULL; 659 struct inode *inode = NULL;
672 struct dentry *result; 660 struct dentry *result;
673 __u32 *fh = inump; 661 u32 *fh = fid->raw;
662
663 if (fh_len < 5 || fh_type != 3)
664 return NULL;
674 665
675 inode = iget(sb, fh[0]); 666 inode = iget(sb, fh[0]);
676 if (!inode || is_bad_inode(inode) || inode->i_generation != fh[1]) { 667 if (!inode || is_bad_inode(inode) || inode->i_generation != fh[1]) {
@@ -783,10 +774,9 @@ out:
783 return parent; 774 return parent;
784} 775}
785 776
786static struct export_operations fat_export_ops = { 777static const struct export_operations fat_export_ops = {
787 .decode_fh = fat_decode_fh,
788 .encode_fh = fat_encode_fh, 778 .encode_fh = fat_encode_fh,
789 .get_dentry = fat_get_dentry, 779 .fh_to_dentry = fat_fh_to_dentry,
790 .get_parent = fat_get_parent, 780 .get_parent = fat_get_parent,
791}; 781};
792 782
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index e2d1347796a..b9da62348a8 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -31,40 +31,6 @@
31#define GFS2_LARGE_FH_SIZE 8 31#define GFS2_LARGE_FH_SIZE 8
32#define GFS2_OLD_FH_SIZE 10 32#define GFS2_OLD_FH_SIZE 10
33 33
34static struct dentry *gfs2_decode_fh(struct super_block *sb,
35 __u32 *p,
36 int fh_len,
37 int fh_type,
38 int (*acceptable)(void *context,
39 struct dentry *dentry),
40 void *context)
41{
42 __be32 *fh = (__force __be32 *)p;
43 struct gfs2_inum_host inum, parent;
44
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_len) {
48 case GFS2_LARGE_FH_SIZE:
49 case GFS2_OLD_FH_SIZE:
50 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
51 parent.no_formal_ino |= be32_to_cpu(fh[5]);
52 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
53 parent.no_addr |= be32_to_cpu(fh[7]);
54 case GFS2_SMALL_FH_SIZE:
55 inum.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
56 inum.no_formal_ino |= be32_to_cpu(fh[1]);
57 inum.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
58 inum.no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &inum, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, 34static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
69 int connectable) 35 int connectable)
70{ 36{
@@ -189,10 +155,10 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
189 return dentry; 155 return dentry;
190} 156}
191 157
192static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) 158static struct dentry *gfs2_get_dentry(struct super_block *sb,
159 struct gfs2_inum_host *inum)
193{ 160{
194 struct gfs2_sbd *sdp = sb->s_fs_info; 161 struct gfs2_sbd *sdp = sb->s_fs_info;
195 struct gfs2_inum_host *inum = inum_obj;
196 struct gfs2_holder i_gh, ri_gh, rgd_gh; 162 struct gfs2_holder i_gh, ri_gh, rgd_gh;
197 struct gfs2_rgrpd *rgd; 163 struct gfs2_rgrpd *rgd;
198 struct inode *inode; 164 struct inode *inode;
@@ -289,11 +255,50 @@ fail:
289 return ERR_PTR(error); 255 return ERR_PTR(error);
290} 256}
291 257
292struct export_operations gfs2_export_ops = { 258static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
293 .decode_fh = gfs2_decode_fh, 259 int fh_len, int fh_type)
260{
261 struct gfs2_inum_host this;
262 __be32 *fh = (__force __be32 *)fid->raw;
263
264 switch (fh_type) {
265 case GFS2_SMALL_FH_SIZE:
266 case GFS2_LARGE_FH_SIZE:
267 case GFS2_OLD_FH_SIZE:
268 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
269 this.no_formal_ino |= be32_to_cpu(fh[1]);
270 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
271 this.no_addr |= be32_to_cpu(fh[3]);
272 return gfs2_get_dentry(sb, &this);
273 default:
274 return NULL;
275 }
276}
277
278static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
279 int fh_len, int fh_type)
280{
281 struct gfs2_inum_host parent;
282 __be32 *fh = (__force __be32 *)fid->raw;
283
284 switch (fh_type) {
285 case GFS2_LARGE_FH_SIZE:
286 case GFS2_OLD_FH_SIZE:
287 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
288 parent.no_formal_ino |= be32_to_cpu(fh[5]);
289 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
290 parent.no_addr |= be32_to_cpu(fh[7]);
291 return gfs2_get_dentry(sb, &parent);
292 default:
293 return NULL;
294 }
295}
296
297const struct export_operations gfs2_export_ops = {
294 .encode_fh = gfs2_encode_fh, 298 .encode_fh = gfs2_encode_fh,
299 .fh_to_dentry = gfs2_fh_to_dentry,
300 .fh_to_parent = gfs2_fh_to_parent,
295 .get_name = gfs2_get_name, 301 .get_name = gfs2_get_name,
296 .get_parent = gfs2_get_parent, 302 .get_parent = gfs2_get_parent,
297 .get_dentry = gfs2_get_dentry,
298}; 303};
299 304
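
Two details above are easy to miss. First, gfs2 spreads each 64-bit field across two big-endian 32-bit words, so both new functions repeat the same unpacking; a helper along these lines (the name is hypothetical, not part of the patch) would express it once:

    static u64 gfs2_fh_u64(const __be32 *fh)
    {
        /* high word first, matching the layout the decode above expects */
        return ((u64)be32_to_cpu(fh[0]) << 32) | be32_to_cpu(fh[1]);
    }

so that gfs2_fh_to_dentry() could read this.no_formal_ino = gfs2_fh_u64(&fh[0]) and this.no_addr = gfs2_fh_u64(&fh[2]). Second, gfs2's encode_fh returns the handle length as its type, which is why both switch statements compare fh_type against the GFS2_*_FH_SIZE length constants rather than against FILEID_* values.
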
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
index 407029b3b2b..da849051183 100644
--- a/fs/gfs2/ops_fstype.h
+++ b/fs/gfs2/ops_fstype.h
@@ -14,6 +14,6 @@
14 14
15extern struct file_system_type gfs2_fs_type; 15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type; 16extern struct file_system_type gfs2meta_fs_type;
17extern struct export_operations gfs2_export_ops; 17extern const struct export_operations gfs2_export_ops;
18 18
19#endif /* __OPS_FSTYPE_DOT_H__ */ 19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 4af856a7fda..29f9753ae5e 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -42,16 +42,6 @@ isofs_export_iget(struct super_block *sb,
42 return result; 42 return result;
43} 43}
44 44
45static struct dentry *
46isofs_export_get_dentry(struct super_block *sb, void *vobjp)
47{
48 __u32 *objp = vobjp;
49 unsigned long block = objp[0];
50 unsigned long offset = objp[1];
51 __u32 generation = objp[2];
52 return isofs_export_iget(sb, block, offset, generation);
53}
54
55/* This function is surprisingly simple. The trick is understanding 45/* This function is surprisingly simple. The trick is understanding
56 * that "child" is always a directory. So, to find its parent, you 46 * that "child" is always a directory. So, to find its parent, you
57 * simply need to find its ".." entry, normalize its block and offset, 47 * simply need to find its ".." entry, normalize its block and offset,
@@ -182,43 +172,44 @@ isofs_export_encode_fh(struct dentry *dentry,
182 return type; 172 return type;
183} 173}
184 174
175struct isofs_fid {
176 u32 block;
177 u16 offset;
178 u16 parent_offset;
179 u32 generation;
180 u32 parent_block;
181 u32 parent_generation;
182};
185 183
186static struct dentry * 184static struct dentry *isofs_fh_to_dentry(struct super_block *sb,
187isofs_export_decode_fh(struct super_block *sb, 185 struct fid *fid, int fh_len, int fh_type)
188 __u32 *fh32,
189 int fh_len,
190 int fileid_type,
191 int (*acceptable)(void *context, struct dentry *de),
192 void *context)
193{ 186{
194 __u16 *fh16 = (__u16*)fh32; 187 struct isofs_fid *ifid = (struct isofs_fid *)fid;
195 __u32 child[3]; /* The child is what triggered all this. */
196 __u32 parent[3]; /* The parent is just along for the ride. */
197 188
198 if (fh_len < 3 || fileid_type > 2) 189 if (fh_len < 3 || fh_type > 2)
199 return NULL; 190 return NULL;
200 191
201 child[0] = fh32[0]; 192 return isofs_export_iget(sb, ifid->block, ifid->offset,
202 child[1] = fh16[2]; /* fh16 [sic] */ 193 ifid->generation);
203 child[2] = fh32[2];
204
205 parent[0] = 0;
206 parent[1] = 0;
207 parent[2] = 0;
208 if (fileid_type == 2) {
209 if (fh_len > 2) parent[0] = fh32[3];
210 parent[1] = fh16[3]; /* fh16 [sic] */
211 if (fh_len > 4) parent[2] = fh32[4];
212 }
213
214 return sb->s_export_op->find_exported_dentry(sb, child, parent,
215 acceptable, context);
216} 194}
217 195
196static struct dentry *isofs_fh_to_parent(struct super_block *sb,
197 struct fid *fid, int fh_len, int fh_type)
198{
199 struct isofs_fid *ifid = (struct isofs_fid *)fid;
200
201 if (fh_type != 2)
202 return NULL;
203
204 return isofs_export_iget(sb,
205 fh_len > 2 ? ifid->parent_block : 0,
206 ifid->parent_offset,
207 fh_len > 4 ? ifid->parent_generation : 0);
208}
218 209
219struct export_operations isofs_export_ops = { 210const struct export_operations isofs_export_ops = {
220 .decode_fh = isofs_export_decode_fh,
221 .encode_fh = isofs_export_encode_fh, 211 .encode_fh = isofs_export_encode_fh,
222 .get_dentry = isofs_export_get_dentry, 212 .fh_to_dentry = isofs_fh_to_dentry,
213 .fh_to_parent = isofs_fh_to_parent,
223 .get_parent = isofs_export_get_parent, 214 .get_parent = isofs_export_get_parent,
224}; 215};
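
struct isofs_fid is a typed view over the same words the removed code indexed by hand: block is raw[0], generation raw[2], parent_block raw[3] and parent_generation raw[4], while offset and parent_offset are the two 16-bit halves of raw[1], in host order, exactly as the old fh16[2]/fh16[3] accesses were. A compile-time guard in this style (a sketch, not part of the patch) would document that the overlay fits inside the generic handle:

    BUILD_BUG_ON(sizeof(struct isofs_fid) > sizeof(struct fid));

Note that isofs_fh_to_parent() keeps the old guards, substituting 0 for words that a short NFSv2 handle cannot carry.
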
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index a07e67b1ea7..f3213f9f89a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -178,4 +178,4 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
178extern const struct inode_operations isofs_dir_inode_operations; 178extern const struct inode_operations isofs_dir_inode_operations;
179extern const struct file_operations isofs_dir_operations; 179extern const struct file_operations isofs_dir_operations;
180extern const struct address_space_operations isofs_symlink_aops; 180extern const struct address_space_operations isofs_symlink_aops;
181extern struct export_operations isofs_export_ops; 181extern const struct export_operations isofs_export_ops;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index f0ec72b263f..8e2cf2cde18 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -18,6 +18,8 @@
18#ifndef _H_JFS_INODE 18#ifndef _H_JFS_INODE
19#define _H_JFS_INODE 19#define _H_JFS_INODE
20 20
21struct fid;
22
21extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
22extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, struct dentry *, int);
23extern int jfs_ioctl(struct inode *, struct file *, 25extern int jfs_ioctl(struct inode *, struct file *,
@@ -32,7 +34,10 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
32extern void jfs_free_zero_link(struct inode *); 34extern void jfs_free_zero_link(struct inode *);
33extern struct dentry *jfs_get_parent(struct dentry *dentry); 35extern struct dentry *jfs_get_parent(struct dentry *dentry);
34extern void jfs_get_inode_flags(struct jfs_inode_info *); 36extern void jfs_get_inode_flags(struct jfs_inode_info *);
35extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp); 37extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
38 int fh_len, int fh_type);
39extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
40 int fh_len, int fh_type);
36extern void jfs_set_inode_flags(struct inode *); 41extern void jfs_set_inode_flags(struct inode *);
37extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
38 43
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 932797ba433..4e0a8493cef 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -20,6 +20,7 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/quotaops.h> 22#include <linux/quotaops.h>
23#include <linux/exportfs.h>
23#include "jfs_incore.h" 24#include "jfs_incore.h"
24#include "jfs_superblock.h" 25#include "jfs_superblock.h"
25#include "jfs_inode.h" 26#include "jfs_inode.h"
@@ -1477,13 +1478,10 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1477 return dentry; 1478 return dentry;
1478} 1479}
1479 1480
1480struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp) 1481static struct inode *jfs_nfs_get_inode(struct super_block *sb,
1482 u64 ino, u32 generation)
1481{ 1483{
1482 __u32 *objp = vobjp;
1483 unsigned long ino = objp[0];
1484 __u32 generation = objp[1];
1485 struct inode *inode; 1484 struct inode *inode;
1486 struct dentry *result;
1487 1485
1488 if (ino == 0) 1486 if (ino == 0)
1489 return ERR_PTR(-ESTALE); 1487 return ERR_PTR(-ESTALE);
@@ -1493,20 +1491,25 @@ struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp)
1493 1491
1494 if (is_bad_inode(inode) || 1492 if (is_bad_inode(inode) ||
1495 (generation && inode->i_generation != generation)) { 1493 (generation && inode->i_generation != generation)) {
1496 result = ERR_PTR(-ESTALE); 1494 iput(inode);
1497 goto out_iput; 1495 return ERR_PTR(-ESTALE);
1498 } 1496 }
1499 1497
1500 result = d_alloc_anon(inode); 1498 return inode;
1501 if (!result) { 1499}
1502 result = ERR_PTR(-ENOMEM);
1503 goto out_iput;
1504 }
1505 return result;
1506 1500
1507 out_iput: 1501struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1508 iput(inode); 1502 int fh_len, int fh_type)
1509 return result; 1503{
1504 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1505 jfs_nfs_get_inode);
1506}
1507
1508struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1509 int fh_len, int fh_type)
1510{
1511 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1512 jfs_nfs_get_inode);
1510} 1513}
1511 1514
1512struct dentry *jfs_get_parent(struct dentry *dentry) 1515struct dentry *jfs_get_parent(struct dentry *dentry)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index cff60c17194..314bb4ff1ba 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -48,7 +48,7 @@ MODULE_LICENSE("GPL");
48static struct kmem_cache * jfs_inode_cachep; 48static struct kmem_cache * jfs_inode_cachep;
49 49
50static const struct super_operations jfs_super_operations; 50static const struct super_operations jfs_super_operations;
51static struct export_operations jfs_export_operations; 51static const struct export_operations jfs_export_operations;
52static struct file_system_type jfs_fs_type; 52static struct file_system_type jfs_fs_type;
53 53
54#define MAX_COMMIT_THREADS 64 54#define MAX_COMMIT_THREADS 64
@@ -737,8 +737,9 @@ static const struct super_operations jfs_super_operations = {
737#endif 737#endif
738}; 738};
739 739
740static struct export_operations jfs_export_operations = { 740static const struct export_operations jfs_export_operations = {
741 .get_dentry = jfs_get_dentry, 741 .fh_to_dentry = jfs_fh_to_dentry,
742 .fh_to_parent = jfs_fh_to_parent,
742 .get_parent = jfs_get_parent, 743 .get_parent = jfs_get_parent,
743}; 744};
744 745
diff --git a/fs/libfs.c b/fs/libfs.c
index ae51481e45e..6e68b700958 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8#include <linux/mount.h> 8#include <linux/mount.h>
9#include <linux/vfs.h> 9#include <linux/vfs.h>
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/exportfs.h>
11 12
12#include <asm/uaccess.h> 13#include <asm/uaccess.h>
13 14
@@ -678,6 +679,93 @@ out:
678 return ret; 679 return ret;
679} 680}
680 681
682/*
683 * This is what d_alloc_anon should have been. Once the exportfs
684 * argument transition has been finished I will update d_alloc_anon
685 * to this prototype and this wrapper will go away. --hch
686 */
687static struct dentry *exportfs_d_alloc(struct inode *inode)
688{
689 struct dentry *dentry;
690
691 if (!inode)
692 return NULL;
693 if (IS_ERR(inode))
694 return ERR_PTR(PTR_ERR(inode));
695
696 dentry = d_alloc_anon(inode);
697 if (!dentry) {
698 iput(inode);
699 dentry = ERR_PTR(-ENOMEM);
700 }
701 return dentry;
702}
703
704/**
705 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
706 * @sb: filesystem to do the file handle conversion on
707 * @fid: file handle to convert
708 * @fh_len: length of the file handle in units of 32-bit words
709 * @fh_type: type of file handle
710 * @get_inode: filesystem callback to retrieve inode
711 *
712 * This function decodes @fid as long as it has one of the well-known
713 * Linux filehandle types and calls @get_inode on it to retrieve the
714 * inode for the object specified in the file handle.
715 */
716struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
717 int fh_len, int fh_type, struct inode *(*get_inode)
718 (struct super_block *sb, u64 ino, u32 gen))
719{
720 struct inode *inode = NULL;
721
722 if (fh_len < 2)
723 return NULL;
724
725 switch (fh_type) {
726 case FILEID_INO32_GEN:
727 case FILEID_INO32_GEN_PARENT:
728 inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
729 break;
730 }
731
732 return exportfs_d_alloc(inode);
733}
734EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
735
736/**
737 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
738 * @sb: filesystem to do the file handle conversion on
739 * @fid: file handle to convert
740 * @fh_len: length of the file handle in units of 32-bit words
741 * @fh_type: type of file handle
742 * @get_inode: filesystem callback to retrieve inode
743 *
744 * This function decodes @fid as long as it has one of the well-known
745 * Linux filehandle types and calls @get_inode on it to retrieve the
746 * inode for the _parent_ object specified in the file handle, or
747 * NULL if the handle carries no parent information.
748 */
749struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
750 int fh_len, int fh_type, struct inode *(*get_inode)
751 (struct super_block *sb, u64 ino, u32 gen))
752{
753 struct inode *inode = NULL;
754
755 if (fh_len <= 2)
756 return NULL;
757
758 switch (fh_type) {
759 case FILEID_INO32_GEN_PARENT:
760 inode = get_inode(sb, fid->i32.parent_ino,
761 (fh_len > 3 ? fid->i32.parent_gen : 0));
762 break;
763 }
764
765 return exportfs_d_alloc(inode);
766}
767EXPORT_SYMBOL_GPL(generic_fh_to_parent);
768
681EXPORT_SYMBOL(dcache_dir_close); 769EXPORT_SYMBOL(dcache_dir_close);
682EXPORT_SYMBOL(dcache_dir_lseek); 770EXPORT_SYMBOL(dcache_dir_lseek);
683EXPORT_SYMBOL(dcache_dir_open); 771EXPORT_SYMBOL(dcache_dir_open);
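
Together with the expfs.c changes, these two helpers make the common case very short. A sketch of the wiring for a hypothetical "foofs", where foofs_iget() and foofs_get_parent() are stand-ins and everything else mirrors the conversions in this patch:

    static struct inode *foofs_nfs_get_inode(struct super_block *sb,
                                             u64 ino, u32 generation)
    {
        struct inode *inode = foofs_iget(sb, ino);

        if (IS_ERR(inode))
            return inode;
        if (generation && inode->i_generation != generation) {
            /* handle is from a previous incarnation of this inode number */
            iput(inode);
            return ERR_PTR(-ESTALE);
        }
        return inode;
    }

    static struct dentry *foofs_fh_to_dentry(struct super_block *sb,
                    struct fid *fid, int fh_len, int fh_type)
    {
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    foofs_nfs_get_inode);
    }

    static struct dentry *foofs_fh_to_parent(struct super_block *sb,
                    struct fid *fid, int fh_len, int fh_type)
    {
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    foofs_nfs_get_inode);
    }

    static const struct export_operations foofs_export_ops = {
        .fh_to_dentry = foofs_fh_to_dentry,
        .fh_to_parent = foofs_fh_to_parent,
        .get_parent   = foofs_get_parent,
    };

With no encode_fh the default export_encode_fh() is used, so only FILEID_INO32_GEN and FILEID_INO32_GEN_PARENT handles ever reach the decode side.
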
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 04b26672980..66d0aeb32a4 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -386,15 +386,13 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid)
386 dprintk("exp_export: export of non-dev fs without fsid\n"); 386 dprintk("exp_export: export of non-dev fs without fsid\n");
387 return -EINVAL; 387 return -EINVAL;
388 } 388 }
389 if (!inode->i_sb->s_export_op) { 389
390 if (!inode->i_sb->s_export_op ||
391 !inode->i_sb->s_export_op->fh_to_dentry) {
390 dprintk("exp_export: export of invalid fs type.\n"); 392 dprintk("exp_export: export of invalid fs type.\n");
391 return -EINVAL; 393 return -EINVAL;
392 } 394 }
393 395
394 /* Ok, we can export it */;
395 if (!inode->i_sb->s_export_op->find_exported_dentry)
396 inode->i_sb->s_export_op->find_exported_dentry =
397 find_exported_dentry;
398 return 0; 396 return 0;
399 397
400} 398}
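
The behavioural change here is that a filesystem is now exportable only if it provides ->fh_to_dentry up front. The removed code instead patched the generic helper into the filesystem's export_operations at export time, a runtime mutation of a shared structure, and incidentally the reason none of these structures could previously be const:

    /* Old scheme (removed above): mutate the ops at export time. */
    inode->i_sb->s_export_op->find_exported_dentry = find_exported_dentry;

Dropping it is what allows every export_operations instance in this series to become static const.
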
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 7011d62acfc..4f712e97058 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -115,8 +115,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
115 dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); 115 dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
116 116
117 if (!fhp->fh_dentry) { 117 if (!fhp->fh_dentry) {
118 __u32 *datap=NULL; 118 struct fid *fid = NULL, sfid;
119 __u32 tfh[3]; /* filehandle fragment for oldstyle filehandles */
120 int fileid_type; 119 int fileid_type;
121 int data_left = fh->fh_size/4; 120 int data_left = fh->fh_size/4;
122 121
@@ -128,7 +127,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
128 127
129 if (fh->fh_version == 1) { 128 if (fh->fh_version == 1) {
130 int len; 129 int len;
131 datap = fh->fh_auth;
132 if (--data_left<0) goto out; 130 if (--data_left<0) goto out;
133 switch (fh->fh_auth_type) { 131 switch (fh->fh_auth_type) {
134 case 0: break; 132 case 0: break;
@@ -144,9 +142,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
144 fh->fh_fsid[1] = fh->fh_fsid[2]; 142 fh->fh_fsid[1] = fh->fh_fsid[2];
145 } 143 }
146 if ((data_left -= len)<0) goto out; 144 if ((data_left -= len)<0) goto out;
147 exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap); 145 exp = rqst_exp_find(rqstp, fh->fh_fsid_type,
148 datap += len; 146 fh->fh_auth);
147 fid = (struct fid *)(fh->fh_auth + len);
149 } else { 148 } else {
149 __u32 tfh[2];
150 dev_t xdev; 150 dev_t xdev;
151 ino_t xino; 151 ino_t xino;
152 if (fh->fh_size != NFS_FHSIZE) 152 if (fh->fh_size != NFS_FHSIZE)
@@ -190,22 +190,22 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
190 error = nfserr_badhandle; 190 error = nfserr_badhandle;
191 191
192 if (fh->fh_version != 1) { 192 if (fh->fh_version != 1) {
193 tfh[0] = fh->ofh_ino; 193 sfid.i32.ino = fh->ofh_ino;
194 tfh[1] = fh->ofh_generation; 194 sfid.i32.gen = fh->ofh_generation;
195 tfh[2] = fh->ofh_dirino; 195 sfid.i32.parent_ino = fh->ofh_dirino;
196 datap = tfh; 196 fid = &sfid;
197 data_left = 3; 197 data_left = 3;
198 if (fh->ofh_dirino == 0) 198 if (fh->ofh_dirino == 0)
199 fileid_type = 1; 199 fileid_type = FILEID_INO32_GEN;
200 else 200 else
201 fileid_type = 2; 201 fileid_type = FILEID_INO32_GEN_PARENT;
202 } else 202 } else
203 fileid_type = fh->fh_fileid_type; 203 fileid_type = fh->fh_fileid_type;
204 204
205 if (fileid_type == 0) 205 if (fileid_type == FILEID_ROOT)
206 dentry = dget(exp->ex_dentry); 206 dentry = dget(exp->ex_dentry);
207 else { 207 else {
208 dentry = exportfs_decode_fh(exp->ex_mnt, datap, 208 dentry = exportfs_decode_fh(exp->ex_mnt, fid,
209 data_left, fileid_type, 209 data_left, fileid_type,
210 nfsd_acceptable, exp); 210 nfsd_acceptable, exp);
211 } 211 }
@@ -286,16 +286,21 @@ out:
286 * an inode. In this case a call to fh_update should be made 286 * an inode. In this case a call to fh_update should be made
287 * before the fh goes out on the wire ... 287 * before the fh goes out on the wire ...
288 */ 288 */
289static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, 289static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
290 __u32 *datap, int *maxsize) 290 struct dentry *dentry)
291{ 291{
292 if (dentry == exp->ex_dentry) { 292 if (dentry != exp->ex_dentry) {
293 *maxsize = 0; 293 struct fid *fid = (struct fid *)
294 return 0; 294 (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1);
295 } 295 int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
296 int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK);
296 297
297 return exportfs_encode_fh(dentry, datap, maxsize, 298 fhp->fh_handle.fh_fileid_type =
298 !(exp->ex_flags & NFSEXP_NOSUBTREECHECK)); 299 exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck);
300 fhp->fh_handle.fh_size += maxsize * 4;
301 } else {
302 fhp->fh_handle.fh_fileid_type = FILEID_ROOT;
303 }
299} 304}
300 305
301/* 306/*
@@ -457,12 +462,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
457 datap += len/4; 462 datap += len/4;
458 fhp->fh_handle.fh_size = 4 + len; 463 fhp->fh_handle.fh_size = 4 + len;
459 464
460 if (inode) { 465 if (inode)
461 int size = (fhp->fh_maxsize-len-4)/4; 466 _fh_update(fhp, exp, dentry);
462 fhp->fh_handle.fh_fileid_type =
463 _fh_update(dentry, exp, datap, &size);
464 fhp->fh_handle.fh_size += size*4;
465 }
466 if (fhp->fh_handle.fh_fileid_type == 255) 467 if (fhp->fh_handle.fh_fileid_type == 255)
467 return nfserr_opnotsupp; 468 return nfserr_opnotsupp;
468 } 469 }
@@ -479,7 +480,6 @@ __be32
479fh_update(struct svc_fh *fhp) 480fh_update(struct svc_fh *fhp)
480{ 481{
481 struct dentry *dentry; 482 struct dentry *dentry;
482 __u32 *datap;
483 483
484 if (!fhp->fh_dentry) 484 if (!fhp->fh_dentry)
485 goto out_bad; 485 goto out_bad;
@@ -490,15 +490,10 @@ fh_update(struct svc_fh *fhp)
490 if (fhp->fh_handle.fh_version != 1) { 490 if (fhp->fh_handle.fh_version != 1) {
491 _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); 491 _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
492 } else { 492 } else {
493 int size; 493 if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
494 if (fhp->fh_handle.fh_fileid_type != 0)
495 goto out; 494 goto out;
496 datap = fhp->fh_handle.fh_auth+ 495
497 fhp->fh_handle.fh_size/4 -1; 496 _fh_update(fhp, fhp->fh_export, dentry);
498 size = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
499 fhp->fh_handle.fh_fileid_type =
500 _fh_update(dentry, fhp->fh_export, datap, &size);
501 fhp->fh_handle.fh_size += size*4;
502 if (fhp->fh_handle.fh_fileid_type == 255) 497 if (fhp->fh_handle.fh_fileid_type == 255)
503 return nfserr_opnotsupp; 498 return nfserr_opnotsupp;
504 } 499 }
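
The pointer arithmetic in the new _fh_update() deserves spelling out. For a version-1 handle, fh_compose() above sets fh_size = 4 + len: one four-byte header word plus len bytes of fsid, with fh_auth pointing at the first word after the header. Under those assumptions:

    /*
     * fh_base: [header][fsid word 0]...[fsid word n-1][fileid ...]
     * fh_auth:          ^
     *
     * fh_auth + fh_size/4 - 1 is the first free word after the fsid;
     * e.g. with an 8-byte fsid, fh_size = 12, so the struct fid lands
     * at fh_auth + 2.
     */

_fh_update() writes the fileid there and then grows fh_size by four bytes per word that exportfs_encode_fh() reports back through maxsize.
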
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e93c6142b23..e1781c8b165 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -450,58 +450,40 @@ try_next:
450 return parent_dent; 450 return parent_dent;
451} 451}
452 452
453/** 453static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
454 * ntfs_get_dentry - find a dentry for the inode from a file handle sub-fragment 454 u64 ino, u32 generation)
455 * @sb: super block identifying the mounted ntfs volume
456 * @fh: the file handle sub-fragment
457 *
458 * Find a dentry for the inode given a file handle sub-fragment. This function
459 * is called from fs/exportfs/expfs.c::find_exported_dentry() which in turn is
460 * called from the default ->decode_fh() which is export_decode_fh() in the
461 * same file. The code is closely based on the default ->get_dentry() helper
462 * fs/exportfs/expfs.c::get_object().
463 *
464 * The @fh contains two 32-bit unsigned values, the first one is the inode
465 * number and the second one is the inode generation.
466 *
467 * Return the dentry on success or the error code on error (IS_ERR() is true).
468 */
469static struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh)
470{ 455{
471 struct inode *vi; 456 struct inode *inode;
472 struct dentry *dent;
473 unsigned long ino = ((u32 *)fh)[0];
474 u32 gen = ((u32 *)fh)[1];
475 457
476 ntfs_debug("Entering for inode 0x%lx, generation 0x%x.", ino, gen); 458 inode = ntfs_iget(sb, ino);
477 vi = ntfs_iget(sb, ino); 459 if (!IS_ERR(inode)) {
478 if (IS_ERR(vi)) { 460 if (is_bad_inode(inode) || inode->i_generation != generation) {
479 ntfs_error(sb, "Failed to get inode 0x%lx.", ino); 461 iput(inode);
480 return (struct dentry *)vi; 462 inode = ERR_PTR(-ESTALE);
481 } 463 }
482 if (unlikely(is_bad_inode(vi) || vi->i_generation != gen)) {
483 /* We didn't find the right inode. */
484 ntfs_error(sb, "Inode 0x%lx, bad count: %d %d or version 0x%x "
485 "0x%x.", vi->i_ino, vi->i_nlink,
486 atomic_read(&vi->i_count), vi->i_generation,
487 gen);
488 iput(vi);
489 return ERR_PTR(-ESTALE);
490 }
491 /* Now find a dentry. If possible, get a well-connected one. */
492 dent = d_alloc_anon(vi);
493 if (unlikely(!dent)) {
494 iput(vi);
495 return ERR_PTR(-ENOMEM);
496 } 464 }
497 ntfs_debug("Done for inode 0x%lx, generation 0x%x.", ino, gen); 465
498 return dent; 466 return inode;
467}
468
469static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
470 int fh_len, int fh_type)
471{
472 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
473 ntfs_nfs_get_inode);
474}
475
476static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
477 int fh_len, int fh_type)
478{
479 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
480 ntfs_nfs_get_inode);
499} 481}
500 482
501/** 483/**
502 * Export operations allowing NFS exporting of mounted NTFS partitions. 484 * Export operations allowing NFS exporting of mounted NTFS partitions.
503 * 485 *
504 * We use the default ->decode_fh() and ->encode_fh() for now. Note that they 486 * We use the default ->encode_fh() for now. Note that it
505 * use 32 bits to store the inode number which is an unsigned long so on 64-bit 487 * uses 32 bits to store the inode number, which is an unsigned long and
506 * architectures is usually 64 bits so it would all fail horribly on huge 488 * on 64-bit architectures usually 64 bits, so it would all fail horribly on huge
507 * volumes. I guess we need to define our own encode and decode fh functions 489 * volumes. I guess we need to define our own encode and decode fh functions
@@ -517,10 +499,9 @@ static struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh)
517 * allowing the inode number 0 which is used in NTFS for the system file $MFT 499 * allowing the inode number 0 which is used in NTFS for the system file $MFT
518 * and due to using iget() whereas NTFS needs ntfs_iget(). 500 * and due to using iget() whereas NTFS needs ntfs_iget().
519 */ 501 */
520struct export_operations ntfs_export_ops = { 502const struct export_operations ntfs_export_ops = {
521 .get_parent = ntfs_get_parent, /* Find the parent of a given 503 .get_parent = ntfs_get_parent, /* Find the parent of a given
522 directory. */ 504 directory. */
523 .get_dentry = ntfs_get_dentry, /* Find a dentry for the inode 505 .fh_to_dentry = ntfs_fh_to_dentry,
524 given a file handle 506 .fh_to_parent = ntfs_fh_to_parent,
525 sub-fragment. */
526}; 507};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index d73f5a9ac34..d6a340bf80f 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -69,7 +69,7 @@ extern const struct inode_operations ntfs_dir_inode_ops;
69extern const struct file_operations ntfs_empty_file_ops; 69extern const struct file_operations ntfs_empty_file_ops;
70extern const struct inode_operations ntfs_empty_inode_ops; 70extern const struct inode_operations ntfs_empty_inode_ops;
71 71
72extern struct export_operations ntfs_export_ops; 72extern const struct export_operations ntfs_export_ops;
73 73
74/** 74/**
75 * NTFS_SB - return the ntfs volume given a vfs super block 75 * NTFS_SB - return the ntfs volume given a vfs super block
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index c3bbc198f9c..535bfa9568a 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -45,9 +45,9 @@ struct ocfs2_inode_handle
45 u32 ih_generation; 45 u32 ih_generation;
46}; 46};
47 47
48static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) 48static struct dentry *ocfs2_get_dentry(struct super_block *sb,
49 struct ocfs2_inode_handle *handle)
49{ 50{
50 struct ocfs2_inode_handle *handle = vobjp;
51 struct inode *inode; 51 struct inode *inode;
52 struct dentry *result; 52 struct dentry *result;
53 53
@@ -194,54 +194,37 @@ bail:
194 return type; 194 return type;
195} 195}
196 196
197static struct dentry *ocfs2_decode_fh(struct super_block *sb, u32 *fh_in, 197static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
198 int fh_len, int fileid_type, 198 struct fid *fid, int fh_len, int fh_type)
199 int (*acceptable)(void *context,
200 struct dentry *de),
201 void *context)
202{ 199{
203 struct ocfs2_inode_handle handle, parent; 200 struct ocfs2_inode_handle handle;
204 struct dentry *ret = NULL;
205 __le32 *fh = (__force __le32 *) fh_in;
206
207 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
208 sb, fh, fh_len, fileid_type, acceptable, context);
209
210 if (fh_len < 3 || fileid_type > 2)
211 goto bail;
212
213 if (fileid_type == 2) {
214 if (fh_len < 6)
215 goto bail;
216
217 parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
218 parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
219 parent.ih_generation = le32_to_cpu(fh[5]);
220 201
221 mlog(0, "Decoding parent: blkno: %llu, generation: %u\n", 202 if (fh_len < 3 || fh_type > 2)
222 (unsigned long long)parent.ih_blkno, 203 return NULL;
223 parent.ih_generation);
224 }
225 204
226 handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32; 205 handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
227 handle.ih_blkno |= (u64)le32_to_cpu(fh[1]); 206 handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
228 handle.ih_generation = le32_to_cpu(fh[2]); 207 handle.ih_generation = le32_to_cpu(fid->raw[2]);
208 return ocfs2_get_dentry(sb, &handle);
209}
229 210
230 mlog(0, "Encoding fh: blkno: %llu, generation: %u\n", 211static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
231 (unsigned long long)handle.ih_blkno, handle.ih_generation); 212 struct fid *fid, int fh_len, int fh_type)
213{
214 struct ocfs2_inode_handle parent;
232 215
233 ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent, 216 if (fh_type != 2 || fh_len < 6)
234 acceptable, context); 217 return NULL;
235 218
236bail: 219 parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
237 mlog_exit_ptr(ret); 220 parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
238 return ret; 221 parent.ih_generation = le32_to_cpu(fid->raw[5]);
222 return ocfs2_get_dentry(sb, &parent);
239} 223}
240 224
241struct export_operations ocfs2_export_ops = { 225const struct export_operations ocfs2_export_ops = {
242 .decode_fh = ocfs2_decode_fh,
243 .encode_fh = ocfs2_encode_fh, 226 .encode_fh = ocfs2_encode_fh,
244 227 .fh_to_dentry = ocfs2_fh_to_dentry,
228 .fh_to_parent = ocfs2_fh_to_parent,
245 .get_parent = ocfs2_get_parent, 229 .get_parent = ocfs2_get_parent,
246 .get_dentry = ocfs2_get_dentry,
247}; 230};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index e08bed9e45a..41a738678c3 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -28,6 +28,6 @@
28 28
29#include <linux/exportfs.h> 29#include <linux/exportfs.h>
30 30
31extern struct export_operations ocfs2_export_ops; 31extern const struct export_operations ocfs2_export_ops;
32 32
33#endif /* OCFS2_EXPORT_H */ 33#endif /* OCFS2_EXPORT_H */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a991af96f3f..231fd5ccadc 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1515,19 +1515,20 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1515 return inode; 1515 return inode;
1516} 1516}
1517 1517
1518struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) 1518static struct dentry *reiserfs_get_dentry(struct super_block *sb,
1519 u32 objectid, u32 dir_id, u32 generation)
1520
1519{ 1521{
1520 __u32 *data = vobjp;
1521 struct cpu_key key; 1522 struct cpu_key key;
1522 struct dentry *result; 1523 struct dentry *result;
1523 struct inode *inode; 1524 struct inode *inode;
1524 1525
1525 key.on_disk_key.k_objectid = data[0]; 1526 key.on_disk_key.k_objectid = objectid;
1526 key.on_disk_key.k_dir_id = data[1]; 1527 key.on_disk_key.k_dir_id = dir_id;
1527 reiserfs_write_lock(sb); 1528 reiserfs_write_lock(sb);
1528 inode = reiserfs_iget(sb, &key); 1529 inode = reiserfs_iget(sb, &key);
1529 if (inode && !IS_ERR(inode) && data[2] != 0 && 1530 if (inode && !IS_ERR(inode) && generation != 0 &&
1530 data[2] != inode->i_generation) { 1531 generation != inode->i_generation) {
1531 iput(inode); 1532 iput(inode);
1532 inode = NULL; 1533 inode = NULL;
1533 } 1534 }
@@ -1544,14 +1545,9 @@ struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1544 return result; 1545 return result;
1545} 1546}
1546 1547
1547struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data, 1548struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1548 int len, int fhtype, 1549 int fh_len, int fh_type)
1549 int (*acceptable) (void *contect,
1550 struct dentry * de),
1551 void *context)
1552{ 1550{
1553 __u32 obj[3], parent[3];
1554
1555 /* fhtype happens to reflect the number of u32s encoded. 1551 /* fhtype happens to reflect the number of u32s encoded.
1556 * due to a bug in earlier code, fhtype might indicate there 1552 * due to a bug in earlier code, fhtype might indicate there
1557 * are more u32s than actually fit. 1553 * are more u32s than actually fit.
@@ -1564,32 +1560,28 @@ struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
1564 * 6 - as above plus generation of directory 1560 * 6 - as above plus generation of directory
1565 * 6 does not fit in NFSv2 handles 1561 * 6 does not fit in NFSv2 handles
1566 */ 1562 */
1567 if (fhtype > len) { 1563 if (fh_type > fh_len) {
1568 if (fhtype != 6 || len != 5) 1564 if (fh_type != 6 || fh_len != 5)
1569 reiserfs_warning(sb, 1565 reiserfs_warning(sb,
1570 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1566 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1571 fhtype, len); 1567 fh_type, fh_len);
1572 fhtype = 5; 1568 fh_type = 5;
1573 } 1569 }
1574 1570
1575 obj[0] = data[0]; 1571 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1576 obj[1] = data[1]; 1572 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
1577 if (fhtype == 3 || fhtype >= 5) 1573}
1578 obj[2] = data[2];
1579 else
1580 obj[2] = 0; /* generation number */
1581 1574
1582 if (fhtype >= 4) { 1575struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1583 parent[0] = data[fhtype >= 5 ? 3 : 2]; 1576 int fh_len, int fh_type)
1584 parent[1] = data[fhtype >= 5 ? 4 : 3]; 1577{
1585 if (fhtype == 6) 1578 if (fh_type < 4)
1586 parent[2] = data[5]; 1579 return NULL;
1587 else 1580
1588 parent[2] = 0; 1581 return reiserfs_get_dentry(sb,
1589 } 1582 (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
1590 return sb->s_export_op->find_exported_dentry(sb, obj, 1583 (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
1591 fhtype < 4 ? NULL : parent, 1584 (fh_type == 6) ? fid->raw[5] : 0);
1592 acceptable, context);
1593} 1585}
1594 1586
1595int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 1587int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
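
The two conversions above re-encode the handle layouts that the old comment describes; spelled out word by word (fh_type doubles as the number of u32s encoded):

    /*
     * type 2: [objectid][dir_id]
     * type 3: [objectid][dir_id][generation]
     * type 4: [objectid][dir_id][parent objectid][parent dir_id]
     * type 5: [objectid][dir_id][generation][parent objectid][parent dir_id]
     * type 6: as type 5, plus [parent generation]
     */

This is why reiserfs_fh_to_parent() picks raw[3]/raw[4] for types >= 5 but raw[2]/raw[3] for type 4, and why reiserfs_fh_to_dentry() only trusts raw[2] as a generation for types 3 and >= 5.
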
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 98c3781bc06..5cd85fe5df5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -661,11 +661,11 @@ static struct quotactl_ops reiserfs_qctl_operations = {
661}; 661};
662#endif 662#endif
663 663
664static struct export_operations reiserfs_export_ops = { 664static const struct export_operations reiserfs_export_ops = {
665 .encode_fh = reiserfs_encode_fh, 665 .encode_fh = reiserfs_encode_fh,
666 .decode_fh = reiserfs_decode_fh, 666 .fh_to_dentry = reiserfs_fh_to_dentry,
667 .fh_to_parent = reiserfs_fh_to_parent,
667 .get_parent = reiserfs_get_parent, 668 .get_parent = reiserfs_get_parent,
668 .get_dentry = reiserfs_get_dentry,
669}; 669};
670 670
671/* this struct is used in reiserfs_getopt () for containing the value for those 671/* this struct is used in reiserfs_getopt () for containing the value for those
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3586c7a28d2..15bd4948832 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -33,62 +33,25 @@
33static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; 33static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
34 34
35/* 35/*
36 * XFS encodes and decodes the fileid portion of NFS filehandles 36 * Note that we only accept fileids which are long enough rather than allowing
37 * itself instead of letting the generic NFS code do it. This 37 * the parent generation number to default to zero. XFS considers zero a
38 * allows filesystems with 64 bit inode numbers to be exported. 38 * valid generation number, not an invalid/wildcard value.
39 *
40 * Note that a side effect is that xfs_vget() won't be passed a
41 * zero inode/generation pair under normal circumstances. As
42 * however a malicious client could send us such data, the check
43 * remains in that code.
44 */ 39 */
45 40static int xfs_fileid_length(int fileid_type)
46STATIC struct dentry *
47xfs_fs_decode_fh(
48 struct super_block *sb,
49 __u32 *fh,
50 int fh_len,
51 int fileid_type,
52 int (*acceptable)(
53 void *context,
54 struct dentry *de),
55 void *context)
56{ 41{
57 xfs_fid_t ifid; 42 switch (fileid_type) {
58 xfs_fid_t pfid; 43 case FILEID_INO32_GEN:
59 void *parent = NULL; 44 return 2;
60 int is64 = 0; 45 case FILEID_INO32_GEN_PARENT:
61 __u32 *p = fh; 46 return 4;
62 47 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
63#if XFS_BIG_INUMS 48 return 3;
64 is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG); 49 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
65 fileid_type &= ~XFS_FILEID_TYPE_64FLAG; 50 return 6;
66#endif
67
68 /*
69 * Note that we only accept fileids which are long enough
70 * rather than allow the parent generation number to default
71 * to zero. XFS considers zero a valid generation number not
72 * an invalid/wildcard value. There's little point printk'ing
73 * a warning here as we don't have the client information
74 * which would make such a warning useful.
75 */
76 if (fileid_type > 2 ||
77 fh_len < xfs_fileid_length((fileid_type == 2), is64))
78 return NULL;
79
80 p = xfs_fileid_decode_fid2(p, &ifid, is64);
81
82 if (fileid_type == 2) {
83 p = xfs_fileid_decode_fid2(p, &pfid, is64);
84 parent = &pfid;
85 } 51 }
86 52 return 255; /* invalid */
87 fh = (__u32 *)&ifid;
88 return sb->s_export_op->find_exported_dentry(sb, fh, parent, acceptable, context);
89} 53}
90 54
91
92STATIC int 55STATIC int
93xfs_fs_encode_fh( 56xfs_fs_encode_fh(
94 struct dentry *dentry, 57 struct dentry *dentry,
@@ -96,21 +59,21 @@ xfs_fs_encode_fh(
96 int *max_len, 59 int *max_len,
97 int connectable) 60 int connectable)
98{ 61{
62 struct fid *fid = (struct fid *)fh;
63 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
99 struct inode *inode = dentry->d_inode; 64 struct inode *inode = dentry->d_inode;
100 int type = 1; 65 int fileid_type;
101 __u32 *p = fh;
102 int len; 66 int len;
103 int is64 = 0;
104#if XFS_BIG_INUMS
105 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) {
106 /* filesystem may contain 64bit inode numbers */
107 is64 = XFS_FILEID_TYPE_64FLAG;
108 }
109#endif
110 67
111 /* Directories don't need their parent encoded, they have ".." */ 68 /* Directories don't need their parent encoded, they have ".." */
112 if (S_ISDIR(inode->i_mode)) 69 if (S_ISDIR(inode->i_mode))
113 connectable = 0; 70 fileid_type = FILEID_INO32_GEN;
71 else
72 fileid_type = FILEID_INO32_GEN_PARENT;
73
74 /* filesystem may contain 64bit inode numbers */
75 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
76 fileid_type |= XFS_FILEID_TYPE_64FLAG;
114 77
115 /* 78 /*
116 * Only encode if there is enough space given. In practice 79 * Only encode if there is enough space given. In practice
@@ -118,39 +81,118 @@ xfs_fs_encode_fh(
118 * over NFSv2 with the subtree_check export option; the other 81 * over NFSv2 with the subtree_check export option; the other
119 * seven combinations work. The real answer is "don't use v2". 82 * seven combinations work. The real answer is "don't use v2".
120 */ 83 */
121 len = xfs_fileid_length(connectable, is64); 84 len = xfs_fileid_length(fileid_type);
122 if (*max_len < len) 85 if (*max_len < len)
123 return 255; 86 return 255;
124 *max_len = len; 87 *max_len = len;
125 88
126 p = xfs_fileid_encode_inode(p, inode, is64); 89 switch (fileid_type) {
127 if (connectable) { 90 case FILEID_INO32_GEN_PARENT:
128 spin_lock(&dentry->d_lock); 91 spin_lock(&dentry->d_lock);
129 p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64); 92 fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino;
93 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
130 spin_unlock(&dentry->d_lock); 94 spin_unlock(&dentry->d_lock);
131 type = 2; 95 /*FALLTHRU*/
96 case FILEID_INO32_GEN:
97 fid->i32.ino = inode->i_ino;
98 fid->i32.gen = inode->i_generation;
99 break;
100 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
101 spin_lock(&dentry->d_lock);
102 fid64->parent_ino = dentry->d_parent->d_inode->i_ino;
103 fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
104 spin_unlock(&dentry->d_lock);
105 /*FALLTHRU*/
106 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
107 fid64->ino = inode->i_ino;
108 fid64->gen = inode->i_generation;
109 break;
132 } 110 }
133 BUG_ON((p - fh) != len); 111
134 return type | is64; 112 return fileid_type;
135} 113}
136 114
137STATIC struct dentry * 115STATIC struct inode *
138xfs_fs_get_dentry( 116xfs_nfs_get_inode(
139 struct super_block *sb, 117 struct super_block *sb,
140 void *data) 118 u64 ino,
141{ 119 u32 generation)
120 {
121 xfs_fid_t xfid;
142 bhv_vnode_t *vp; 122 bhv_vnode_t *vp;
143 struct inode *inode;
144 struct dentry *result;
145 int error; 123 int error;
146 124
147 error = xfs_vget(XFS_M(sb), &vp, data); 125 xfid.fid_len = sizeof(xfs_fid_t) - sizeof(xfid.fid_len);
148 if (error || vp == NULL) 126 xfid.fid_pad = 0;
149 return ERR_PTR(-ESTALE) ; 127 xfid.fid_ino = ino;
128 xfid.fid_gen = generation;
150 129
151 inode = vn_to_inode(vp); 130 error = xfs_vget(XFS_M(sb), &vp, &xfid);
131 if (error)
132 return ERR_PTR(-error);
133
134 return vp ? vn_to_inode(vp) : NULL;
135}
136
137STATIC struct dentry *
138xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
139 int fh_len, int fileid_type)
140{
141 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
142 struct inode *inode = NULL;
143 struct dentry *result;
144
145 if (fh_len < xfs_fileid_length(fileid_type))
146 return NULL;
147
148 switch (fileid_type) {
149 case FILEID_INO32_GEN_PARENT:
150 case FILEID_INO32_GEN:
151 inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
152 break;
153 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
154 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
155 inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
156 break;
157 }
158
159 if (!inode)
160 return NULL;
161 if (IS_ERR(inode))
162 return ERR_PTR(PTR_ERR(inode));
163 result = d_alloc_anon(inode);
164 if (!result) {
165 iput(inode);
166 return ERR_PTR(-ENOMEM);
167 }
168 return result;
169}
170
171STATIC struct dentry *
172xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
173 int fh_len, int fileid_type)
174{
175 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
176 struct inode *inode = NULL;
177 struct dentry *result;
178
179 switch (fileid_type) {
180 case FILEID_INO32_GEN_PARENT:
181 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
182 fid->i32.parent_gen);
183 break;
184 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
185 inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
186 fid64->parent_gen);
187 break;
188 }
189
190 if (!inode)
191 return NULL;
192 if (IS_ERR(inode))
193 return ERR_PTR(PTR_ERR(inode));
152 result = d_alloc_anon(inode); 194 result = d_alloc_anon(inode);
153 if (!result) { 195 if (!result) {
154 iput(inode); 196 iput(inode);
155 return ERR_PTR(-ENOMEM); 197 return ERR_PTR(-ENOMEM);
156 } 198 }
@@ -178,9 +220,9 @@ xfs_fs_get_parent(
178 return parent; 220 return parent;
179} 221}
180 222
181struct export_operations xfs_export_operations = { 223const struct export_operations xfs_export_operations = {
182 .decode_fh = xfs_fs_decode_fh,
183 .encode_fh = xfs_fs_encode_fh, 224 .encode_fh = xfs_fs_encode_fh,
225 .fh_to_dentry = xfs_fs_fh_to_dentry,
226 .fh_to_parent = xfs_fs_fh_to_parent,
184 .get_parent = xfs_fs_get_parent, 227 .get_parent = xfs_fs_get_parent,
185 .get_dentry = xfs_fs_get_dentry,
186}; 228};
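
The xfs_fileid_length() switch above maps each fileid type to its size in 32-bit words: 2 for a plain inode/generation pair, 4 when the parent pair is appended, and 3 or 6 for the 64-bit variants, where each inode number takes an extra word. A standalone sketch (struct names are illustrative, not from the patch) that cross-checks those counts against equivalent packed layouts:

/*
 * Standalone sketch cross-checking the word counts returned by
 * xfs_fileid_length() against equivalent packed layouts.  The
 * struct names here are illustrative, not from the patch.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fid_i32 {			/* FILEID_INO32_GEN_PARENT */
	uint32_t ino, gen, parent_ino, parent_gen;
};

struct fid_i64 {			/* ... | XFS_FILEID_TYPE_64FLAG */
	uint64_t ino;
	uint32_t gen;
	uint64_t parent_ino;
	uint32_t parent_gen;
} __attribute__((packed));

int main(void)
{
	/* lengths are counted in 32-bit words, as in xfs_fileid_length() */
	assert(sizeof(struct fid_i32) / 4 == 4);	/* type 2   -> 4 words */
	assert(sizeof(struct fid_i64) / 4 == 6);	/* type 2|64 -> 6 words */
	/* the non-parent cases (2 and 3 words) are prefixes of the above */
	printf("fileid word counts match\n");
	return 0;
}
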
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h
index 2f36071a86f..3272b6ae7a3 100644
--- a/fs/xfs/linux-2.6/xfs_export.h
+++ b/fs/xfs/linux-2.6/xfs_export.h
@@ -59,50 +59,14 @@
59 * a subdirectory) or use the "fsid" export option. 59 * a subdirectory) or use the "fsid" export option.
60 */ 60 */
61 61
62struct xfs_fid64 {
63 u64 ino;
64 u32 gen;
65 u64 parent_ino;
66 u32 parent_gen;
67} __attribute__((packed));
68
62/* This flag goes on the wire. Don't play with it. */ 69/* This flag goes on the wire. Don't play with it. */
63#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ 70#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
64 71
65/* Calculate the length in u32 units of the fileid data */
66static inline int
67xfs_fileid_length(int hasparent, int is64)
68{
69 return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
70}
71
72/*
73 * Decode encoded inode information (either for the inode itself
74 * or the parent) into an xfs_fid_t structure. Advances and
75 * returns the new data pointer
76 */
77static inline __u32 *
78xfs_fileid_decode_fid2(__u32 *p, xfs_fid_t *fid, int is64)
79{
80 fid->fid_len = sizeof(xfs_fid_t) - sizeof(fid->fid_len);
81 fid->fid_pad = 0;
82 fid->fid_ino = *p++;
83#if XFS_BIG_INUMS
84 if (is64)
85 fid->fid_ino |= (((__u64)(*p++)) << 32);
86#endif
87 fid->fid_gen = *p++;
88 return p;
89}
90
91/*
92 * Encode inode information (either for the inode itself or the
93 * parent) into a fileid buffer. Advances and returns the new
94 * data pointer.
95 */
96static inline __u32 *
97xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64)
98{
99 *p++ = (__u32)inode->i_ino;
100#if XFS_BIG_INUMS
101 if (is64)
102 *p++ = (__u32)(inode->i_ino >> 32);
103#endif
104 *p++ = inode->i_generation;
105 return p;
106}
107
108#endif /* __XFS_EXPORT_H__ */ 72#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index c78c23310fe..3efcf45b14a 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -118,7 +118,7 @@ extern int xfs_blkdev_get(struct xfs_mount *, const char *,
118extern void xfs_blkdev_put(struct block_device *); 118extern void xfs_blkdev_put(struct block_device *);
119extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 119extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
120 120
121extern struct export_operations xfs_export_operations; 121extern const struct export_operations xfs_export_operations;
122 122
123#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 123#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
124 124
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 4e5d3ca53a8..a1b1b2ee3e5 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -257,7 +257,8 @@ struct acpi_table_dbgp {
257struct acpi_table_dmar { 257struct acpi_table_dmar {
258 struct acpi_table_header header; /* Common ACPI table header */ 258 struct acpi_table_header header; /* Common ACPI table header */
259 u8 width; /* Host Address Width */ 259 u8 width; /* Host Address Width */
260 u8 reserved[11]; 260 u8 flags;
261 u8 reserved[10];
261}; 262};
262 263
263/* DMAR subtable header */ 264/* DMAR subtable header */
@@ -265,8 +266,6 @@ struct acpi_table_dmar {
265struct acpi_dmar_header { 266struct acpi_dmar_header {
266 u16 type; 267 u16 type;
267 u16 length; 268 u16 length;
268 u8 flags;
269 u8 reserved[3];
270}; 269};
271 270
272/* Values for subtable type in struct acpi_dmar_header */ 271/* Values for subtable type in struct acpi_dmar_header */
@@ -274,13 +273,15 @@ struct acpi_dmar_header {
274enum acpi_dmar_type { 273enum acpi_dmar_type {
275 ACPI_DMAR_TYPE_HARDWARE_UNIT = 0, 274 ACPI_DMAR_TYPE_HARDWARE_UNIT = 0,
276 ACPI_DMAR_TYPE_RESERVED_MEMORY = 1, 275 ACPI_DMAR_TYPE_RESERVED_MEMORY = 1,
277 ACPI_DMAR_TYPE_RESERVED = 2 /* 2 and greater are reserved */ 276 ACPI_DMAR_TYPE_ATSR = 2,
277 ACPI_DMAR_TYPE_RESERVED = 3 /* 3 and greater are reserved */
278}; 278};
279 279
280struct acpi_dmar_device_scope { 280struct acpi_dmar_device_scope {
281 u8 entry_type; 281 u8 entry_type;
282 u8 length; 282 u8 length;
283 u8 segment; 283 u16 reserved;
284 u8 enumeration_id;
284 u8 bus; 285 u8 bus;
285}; 286};
286 287
@@ -290,7 +291,14 @@ enum acpi_dmar_scope_type {
290 ACPI_DMAR_SCOPE_TYPE_NOT_USED = 0, 291 ACPI_DMAR_SCOPE_TYPE_NOT_USED = 0,
291 ACPI_DMAR_SCOPE_TYPE_ENDPOINT = 1, 292 ACPI_DMAR_SCOPE_TYPE_ENDPOINT = 1,
292 ACPI_DMAR_SCOPE_TYPE_BRIDGE = 2, 293 ACPI_DMAR_SCOPE_TYPE_BRIDGE = 2,
293 ACPI_DMAR_SCOPE_TYPE_RESERVED = 3 /* 3 and greater are reserved */ 294 ACPI_DMAR_SCOPE_TYPE_IOAPIC = 3,
295 ACPI_DMAR_SCOPE_TYPE_HPET = 4,
296 ACPI_DMAR_SCOPE_TYPE_RESERVED = 5 /* 5 and greater are reserved */
297};
298
299struct acpi_dmar_pci_path {
300 u8 dev;
301 u8 fn;
294}; 302};
295 303
296/* 304/*
@@ -301,6 +309,9 @@ enum acpi_dmar_scope_type {
301 309
302struct acpi_dmar_hardware_unit { 310struct acpi_dmar_hardware_unit {
303 struct acpi_dmar_header header; 311 struct acpi_dmar_header header;
312 u8 flags;
313 u8 reserved;
314 u16 segment;
304 u64 address; /* Register Base Address */ 315 u64 address; /* Register Base Address */
305}; 316};
306 317
@@ -312,7 +323,9 @@ struct acpi_dmar_hardware_unit {
312 323
313struct acpi_dmar_reserved_memory { 324struct acpi_dmar_reserved_memory {
314 struct acpi_dmar_header header; 325 struct acpi_dmar_header header;
315 u64 address; /* 4_k aligned base address */ 326 u16 reserved;
327 u16 segment;
328 u64 base_address; /* 4_k aligned base address */
316 u64 end_address; /* 4_k aligned limit address */ 329 u64 end_address; /* 4_k aligned limit address */
317}; 330};
318 331
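
With the flags byte moved out of struct acpi_dmar_header, every remapping structure reduces to a (type, length) pair, so a parser can walk the table generically and dispatch on the subtable type. A hedged sketch of such a walk, using the types declared above (dmar_walk() itself is illustrative, not the kernel's actual parser):

/*
 * Hedged sketch: walk the remapping structures that follow the fixed
 * DMAR header, dispatching on the subtable type.
 */
static void dmar_walk(struct acpi_table_dmar *dmar)
{
	void *p = dmar + 1;			/* first subtable */
	void *end = (void *)dmar + dmar->header.length;

	while (p < end) {
		struct acpi_dmar_header *entry = p;

		if (entry->length == 0)		/* malformed table, stop */
			break;
		switch (entry->type) {
		case ACPI_DMAR_TYPE_HARDWARE_UNIT:
		case ACPI_DMAR_TYPE_RESERVED_MEMORY:
		case ACPI_DMAR_TYPE_ATSR:
			/* cast p to the matching subtable struct here */
			break;
		}
		p += entry->length;
	}
}
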
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index ef67b59dbdb..dc031cf4463 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -28,8 +28,9 @@ struct setup_header {
28 u16 kernel_version; 28 u16 kernel_version;
29 u8 type_of_loader; 29 u8 type_of_loader;
30 u8 loadflags; 30 u8 loadflags;
31#define LOADED_HIGH 0x01 31#define LOADED_HIGH (1<<0)
32#define CAN_USE_HEAP 0x80 32#define KEEP_SEGMENTS (1<<6)
33#define CAN_USE_HEAP (1<<7)
33 u16 setup_move_size; 34 u16 setup_move_size;
34 u32 code32_start; 35 u32 code32_start;
35 u32 ramdisk_image; 36 u32 ramdisk_image;
@@ -41,6 +42,10 @@ struct setup_header {
41 u32 initrd_addr_max; 42 u32 initrd_addr_max;
42 u32 kernel_alignment; 43 u32 kernel_alignment;
43 u8 relocatable_kernel; 44 u8 relocatable_kernel;
45 u8 _pad2[3];
46 u32 cmdline_size;
47 u32 hardware_subarch;
48 u64 hardware_subarch_data;
44} __attribute__((packed)); 49} __attribute__((packed));
45 50
46struct sys_desc_table { 51struct sys_desc_table {
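
Spelling the loadflags constants as shifts makes the bit positions explicit: bit 0 is LOADED_HIGH, bit 6 the new KEEP_SEGMENTS, bit 7 CAN_USE_HEAP. A minimal sketch of how loader-side code would test and set them (the hdr pointer is assumed; only the flag handling is shown):

/*
 * Sketch: testing and setting setup_header loadflags bits.  "hdr" is
 * an assumed pointer into the loaded boot sector.
 */
static void loader_flags(struct setup_header *hdr)
{
	if (hdr->loadflags & LOADED_HIGH) {
		/* protected-mode kernel lives at 0x100000 */
	}
	hdr->loadflags |= CAN_USE_HEAP;		/* heap_end_ptr is valid */
	hdr->loadflags &= ~KEEP_SEGMENTS;	/* kernel may reload segments */
}
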
diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h
index b3d43de44c5..9411a2d3f19 100644
--- a/include/asm-x86/cacheflush.h
+++ b/include/asm-x86/cacheflush.h
@@ -27,6 +27,7 @@
27void global_flush_tlb(void); 27void global_flush_tlb(void);
28int change_page_attr(struct page *page, int numpages, pgprot_t prot); 28int change_page_attr(struct page *page, int numpages, pgprot_t prot);
29int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot); 29int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot);
30void clflush_cache_range(void *addr, int size);
30 31
31#ifdef CONFIG_DEBUG_PAGEALLOC 32#ifdef CONFIG_DEBUG_PAGEALLOC
32/* internal debugging function */ 33/* internal debugging function */
diff --git a/include/asm-x86/device.h b/include/asm-x86/device.h
index d9ee5e52e91..87a715367a1 100644
--- a/include/asm-x86/device.h
+++ b/include/asm-x86/device.h
@@ -5,6 +5,9 @@ struct dev_archdata {
5#ifdef CONFIG_ACPI 5#ifdef CONFIG_ACPI
6 void *acpi_handle; 6 void *acpi_handle;
7#endif 7#endif
8#ifdef CONFIG_DMAR
9 void *iommu; /* hook for IOMMU specific extension */
10#endif
8}; 11};
9 12
10#endif /* _ASM_X86_DEVICE_H */ 13#endif /* _ASM_X86_DEVICE_H */
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7a8d7ade28a..bb017edffd5 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -56,10 +56,8 @@ typedef struct __user_cap_data_struct {
56 56
57struct vfs_cap_data { 57struct vfs_cap_data {
58 __u32 magic_etc; /* Little endian */ 58 __u32 magic_etc; /* Little endian */
59 struct { 59 __u32 permitted; /* Little endian */
60 __u32 permitted; /* Little endian */ 60 __u32 inheritable; /* Little endian */
61 __u32 inheritable; /* Little endian */
62 } data[1];
63}; 61};
64 62
65#ifdef __KERNEL__ 63#ifdef __KERNEL__
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
new file mode 100644
index 00000000000..ffb6439cb5e
--- /dev/null
+++ b/include/linux/dmar.h
@@ -0,0 +1,86 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 */
20
21#ifndef __DMAR_H__
22#define __DMAR_H__
23
24#include <linux/acpi.h>
25#include <linux/types.h>
26#include <linux/msi.h>
27
28#ifdef CONFIG_DMAR
29struct intel_iommu;
30
31extern char *dmar_get_fault_reason(u8 fault_reason);
32
33/* Can't use the common MSI interrupt functions
34 * since DMAR is not a pci device
35 */
36extern void dmar_msi_unmask(unsigned int irq);
37extern void dmar_msi_mask(unsigned int irq);
38extern void dmar_msi_read(int irq, struct msi_msg *msg);
39extern void dmar_msi_write(int irq, struct msi_msg *msg);
40extern int dmar_set_interrupt(struct intel_iommu *iommu);
41extern int arch_setup_dmar_msi(unsigned int irq);
42
43/* Intel IOMMU detection and initialization functions */
44extern void detect_intel_iommu(void);
45extern int intel_iommu_init(void);
46
47extern int dmar_table_init(void);
48extern int early_dmar_detect(void);
49
50extern struct list_head dmar_drhd_units;
51extern struct list_head dmar_rmrr_units;
52
53struct dmar_drhd_unit {
54 struct list_head list; /* list of drhd units */
55 u64 reg_base_addr; /* register base address*/
56 struct pci_dev **devices; /* target device array */
57 int devices_cnt; /* target device count */
58 u8 ignored:1; /* ignore drhd */
59 u8 include_all:1;
60 struct intel_iommu *iommu;
61};
62
63struct dmar_rmrr_unit {
64 struct list_head list; /* list of rmrr units */
65 u64 base_address; /* reserved base address*/
66 u64 end_address; /* reserved end address */
67 struct pci_dev **devices; /* target devices */
68 int devices_cnt; /* target device count */
69};
70
71#define for_each_drhd_unit(drhd) \
72 list_for_each_entry(drhd, &dmar_drhd_units, list)
73#define for_each_rmrr_units(rmrr) \
74 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
75#else
76static inline void detect_intel_iommu(void)
77{
78 return;
79}
80static inline int intel_iommu_init(void)
81{
82 return -ENODEV;
83}
84
85#endif /* !CONFIG_DMAR */
86#endif /* __DMAR_H__ */
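
Because the header stubs the API out when CONFIG_DMAR is unset, consumers can call it unconditionally. A sketch of walking the DRHD list once dmar_table_init() has populated it (the dump function is illustrative only):

/*
 * Sketch: walking the DRHD units once dmar_table_init() has filled
 * dmar_drhd_units.
 */
static void dump_drhd_units(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		printk(KERN_INFO "DRHD at 0x%llx, %d devices%s\n",
		       (unsigned long long)drhd->reg_base_addr,
		       drhd->devices_cnt,
		       drhd->include_all ? " (include_all)" : "");
	}
}
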
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 0b9579a4cd4..14813b59580 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -298,7 +298,7 @@ extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size,
298 u64 attr); 298 u64 attr);
299extern int __init efi_uart_console_only (void); 299extern int __init efi_uart_console_only (void);
300extern void efi_initialize_iomem_resources(struct resource *code_resource, 300extern void efi_initialize_iomem_resources(struct resource *code_resource,
301 struct resource *data_resource); 301 struct resource *data_resource, struct resource *bss_resource);
302extern unsigned long efi_get_time(void); 302extern unsigned long efi_get_time(void);
303extern int efi_set_rtc_mmss(unsigned long nowtime); 303extern int efi_set_rtc_mmss(unsigned long nowtime);
304extern int is_available_memory(efi_memory_desc_t * md); 304extern int is_available_memory(efi_memory_desc_t * md);
diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h
index 16cb25cbf7c..dd57fe523e9 100644
--- a/include/linux/efs_fs.h
+++ b/include/linux/efs_fs.h
@@ -35,6 +35,7 @@ static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb)
35} 35}
36 36
37struct statfs; 37struct statfs;
38struct fid;
38 39
39extern const struct inode_operations efs_dir_inode_operations; 40extern const struct inode_operations efs_dir_inode_operations;
40extern const struct file_operations efs_dir_operations; 41extern const struct file_operations efs_dir_operations;
@@ -45,7 +46,10 @@ extern efs_block_t efs_map_block(struct inode *, efs_block_t);
45extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); 46extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int);
46 47
47extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); 48extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *);
48extern struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp); 49extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid,
50 int fh_len, int fh_type);
51extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
52 int fh_len, int fh_type);
49extern struct dentry *efs_get_parent(struct dentry *); 53extern struct dentry *efs_get_parent(struct dentry *);
50extern int efs_bmap(struct inode *, int); 54extern int efs_bmap(struct inode *, int);
51 55
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 8872fe8392d..51d21413881 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -4,9 +4,48 @@
4#include <linux/types.h> 4#include <linux/types.h>
5 5
6struct dentry; 6struct dentry;
7struct inode;
7struct super_block; 8struct super_block;
8struct vfsmount; 9struct vfsmount;
9 10
11/*
12 * The fileid_type identifies how the file within the filesystem is encoded.
13 * In theory this is freely set and parsed by the filesystem, but we try to
14 * stick to conventions so we can share some generic code and don't confuse
15 * sniffers like ethereal/wireshark.
16 *
17 * The filesystem must not use the value '0' or '0xff'.
18 */
19enum fid_type {
20 /*
21 * The root, or export point, of the filesystem.
 22 * (Never actually passed down to the filesystem.)
23 */
24 FILEID_ROOT = 0,
25
26 /*
 27 * 32-bit inode number, 32-bit generation number.
28 */
29 FILEID_INO32_GEN = 1,
30
31 /*
 32 * 32-bit inode number, 32-bit generation number,
 33 * 32-bit parent directory inode number.
34 */
35 FILEID_INO32_GEN_PARENT = 2,
36};
37
38struct fid {
39 union {
40 struct {
41 u32 ino;
42 u32 gen;
43 u32 parent_ino;
44 u32 parent_gen;
45 } i32;
46 __u32 raw[6];
47 };
48};
10 49
11/** 50/**
12 * struct export_operations - for nfsd to communicate with file systems 51 * struct export_operations - for nfsd to communicate with file systems
@@ -15,43 +54,9 @@ struct vfsmount;
15 * @get_name: find the name for a given inode in a given directory 54 * @get_name: find the name for a given inode in a given directory
16 * @get_parent: find the parent of a given directory 55 * @get_parent: find the parent of a given directory
17 * @get_dentry: find a dentry for the inode given a file handle sub-fragment 56 * @get_dentry: find a dentry for the inode given a file handle sub-fragment
18 * @find_exported_dentry:
19 * set by the exporting module to a standard helper function.
20 *
21 * Description:
22 * The export_operations structure provides a means for nfsd to communicate
23 * with a particular exported file system - particularly enabling nfsd and
24 * the filesystem to co-operate when dealing with file handles.
25 *
26 * export_operations contains two basic operation for dealing with file
27 * handles, decode_fh() and encode_fh(), and allows for some other
28 * operations to be defined which standard helper routines use to get
29 * specific information from the filesystem.
30 *
31 * nfsd encodes information use to determine which filesystem a filehandle
32 * applies to in the initial part of the file handle. The remainder, termed
33 * a file handle fragment, is controlled completely by the filesystem. The
34 * standard helper routines assume that this fragment will contain one or
35 * two sub-fragments, one which identifies the file, and one which may be
36 * used to identify the (a) directory containing the file.
37 * 57 *
38 * In some situations, nfsd needs to get a dentry which is connected into a 58 * See Documentation/filesystems/Exporting for details on how to use
39 * specific part of the file tree. To allow for this, it passes the 59 * this interface correctly.
40 * function acceptable() together with a @context which can be used to see
41 * if the dentry is acceptable. As there can be multiple dentrys for a
42 * given file, the filesystem should check each one for acceptability before
43 * looking for the next. As soon as an acceptable one is found, it should
44 * be returned.
45 *
46 * decode_fh:
47 * @decode_fh is given a &struct super_block (@sb), a file handle fragment
48 * (@fh, @fh_len) and an acceptability testing function (@acceptable,
49 * @context). It should return a &struct dentry which refers to the same
50 * file that the file handle fragment refers to, and which passes the
51 * acceptability test. If it cannot, it should return a %NULL pointer if
52 * the file was found but no acceptable &dentries were available, or a
53 * %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT or
54 * %ENOMEM).
55 * 60 *
56 * encode_fh: 61 * encode_fh:
57 * @encode_fh should store in the file handle fragment @fh (using at most 62 * @encode_fh should store in the file handle fragment @fh (using at most
@@ -63,6 +68,21 @@ struct vfsmount;
63 * the filehandle fragment. encode_fh() should return the number of bytes 68 * the filehandle fragment. encode_fh() should return the number of bytes
64 * stored or a negative error code such as %-ENOSPC 69 * stored or a negative error code such as %-ENOSPC
65 * 70 *
71 * fh_to_dentry:
72 * @fh_to_dentry is given a &struct super_block (@sb) and a file handle
73 * fragment (@fh, @fh_len). It should return a &struct dentry which refers
74 * to the same file that the file handle fragment refers to. If it cannot,
75 * it should return a %NULL pointer if the file was found but no acceptable
76 * &dentries were available, or an %ERR_PTR error code indicating why it
77 * couldn't be found (e.g. %ENOENT or %ENOMEM). Any suitable dentry can be
78 * returned including, if necessary, a new dentry created with d_alloc_root.
79 * The caller can then find any other extant dentries by following the
80 * d_alias links.
81 *
82 * fh_to_parent:
83 * Same as @fh_to_dentry, except that it returns a pointer to the parent
84 * dentry if it was encoded into the filehandle fragment by @encode_fh.
85 *
66 * get_name: 86 * get_name:
67 * @get_name should find a name for the given @child in the given @parent 87 * @get_name should find a name for the given @child in the given @parent
68 * directory. The name should be stored in the @name (with the 88 * directory. The name should be stored in the @name (with the
@@ -75,52 +95,37 @@ struct vfsmount;
75 * is also a directory. In the event that it cannot be found, or storage 95 * is also a directory. In the event that it cannot be found, or storage
76 * space cannot be allocated, a %ERR_PTR should be returned. 96 * space cannot be allocated, a %ERR_PTR should be returned.
77 * 97 *
78 * get_dentry:
79 * Given a &super_block (@sb) and a pointer to a file-system specific inode
80 * identifier, possibly an inode number, (@inump) get_dentry() should find
81 * the identified inode and return a dentry for that inode. Any suitable
82 * dentry can be returned including, if necessary, a new dentry created with
83 * d_alloc_root. The caller can then find any other extant dentrys by
84 * following the d_alias links. If a new dentry was created using
85 * d_alloc_root, DCACHE_NFSD_DISCONNECTED should be set, and the dentry
86 * should be d_rehash()ed.
87 *
88 * If the inode cannot be found, either a %NULL pointer or an %ERR_PTR code
89 * can be returned. The @inump will be whatever was passed to
90 * nfsd_find_fh_dentry() in either the @obj or @parent parameters.
91 *
92 * Locking rules: 98 * Locking rules:
93 * get_parent is called with child->d_inode->i_mutex down 99 * get_parent is called with child->d_inode->i_mutex down
94 * get_name is not (which is possibly inconsistent) 100 * get_name is not (which is possibly inconsistent)
95 */ 101 */
96 102
97struct export_operations { 103struct export_operations {
98 struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh,
99 int fh_len, int fh_type,
100 int (*acceptable)(void *context, struct dentry *de),
101 void *context);
102 int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, 104 int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len,
103 int connectable); 105 int connectable);
106 struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid,
107 int fh_len, int fh_type);
108 struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid,
109 int fh_len, int fh_type);
104 int (*get_name)(struct dentry *parent, char *name, 110 int (*get_name)(struct dentry *parent, char *name,
105 struct dentry *child); 111 struct dentry *child);
106 struct dentry * (*get_parent)(struct dentry *child); 112 struct dentry * (*get_parent)(struct dentry *child);
107 struct dentry * (*get_dentry)(struct super_block *sb, void *inump);
108
109 /* This is set by the exporting module to a standard helper */
110 struct dentry * (*find_exported_dentry)(
111 struct super_block *sb, void *obj, void *parent,
112 int (*acceptable)(void *context, struct dentry *de),
113 void *context);
114}; 113};
115 114
116extern struct dentry *find_exported_dentry(struct super_block *sb, void *obj, 115extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
117 void *parent, int (*acceptable)(void *context, struct dentry *de), 116 int *max_len, int connectable);
118 void *context); 117extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
119
120extern int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
121 int connectable);
122extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh,
123 int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), 118 int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *),
124 void *context); 119 void *context);
125 120
121/*
122 * Generic helpers for filesystems.
123 */
124extern struct dentry *generic_fh_to_dentry(struct super_block *sb,
125 struct fid *fid, int fh_len, int fh_type,
126 struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
127extern struct dentry *generic_fh_to_parent(struct super_block *sb,
128 struct fid *fid, int fh_len, int fh_type,
129 struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
130
126#endif /* LINUX_EXPORTFS_H */ 131#endif /* LINUX_EXPORTFS_H */
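
For a filesystem whose handles are plain 32-bit inode/generation pairs, the generic helpers shrink the export_operations to a single inode-lookup callback. A hedged sketch of the conversion pattern the filesystems in this patch follow (every myfs_* name is a placeholder; myfs_iget() stands in for the filesystem's own inode lookup):

static struct inode *myfs_nfs_get_inode(struct super_block *sb,
					u64 ino, u32 gen)
{
	struct inode *inode = myfs_iget(sb, ino);	/* assumed helper */

	if (IS_ERR(inode))
		return inode;
	if (inode->i_generation != gen) {
		iput(inode);
		return ERR_PTR(-ESTALE);	/* stale file handle */
	}
	return inode;
}

static struct dentry *myfs_fh_to_dentry(struct super_block *sb,
					struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    myfs_nfs_get_inode);
}

static struct dentry *myfs_fh_to_parent(struct super_block *sb,
					struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    myfs_nfs_get_inode);
}

static const struct export_operations myfs_export_ops = {
	.fh_to_dentry	= myfs_fh_to_dentry,
	.fh_to_parent	= myfs_fh_to_parent,
};
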
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index c77c3bbfe4b..0f6c86c634f 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -561,6 +561,7 @@ enum {
561#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1) 561#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1)
562#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \ 562#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \
563 ~EXT2_DIR_ROUND) 563 ~EXT2_DIR_ROUND)
564#define EXT2_MAX_REC_LEN ((1<<16)-1)
564 565
565static inline ext2_fsblk_t 566static inline ext2_fsblk_t
566ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) 567ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 50078bb30a1..b3ec4a496d6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -987,7 +987,7 @@ struct super_block {
987 const struct super_operations *s_op; 987 const struct super_operations *s_op;
988 struct dquot_operations *dq_op; 988 struct dquot_operations *dq_op;
989 struct quotactl_ops *s_qcop; 989 struct quotactl_ops *s_qcop;
990 struct export_operations *s_export_op; 990 const struct export_operations *s_export_op;
991 unsigned long s_flags; 991 unsigned long s_flags;
992 unsigned long s_magic; 992 unsigned long s_magic;
993 struct dentry *s_root; 993 struct dentry *s_root;
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 6c9873f8828..ff203dd0291 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -34,6 +34,12 @@
34 name: 34 name:
35#endif 35#endif
36 36
37#ifndef WEAK
38#define WEAK(name) \
39 .weak name; \
40 name:
41#endif
42
37#define KPROBE_ENTRY(name) \ 43#define KPROBE_ENTRY(name) \
38 .pushsection .kprobes.text, "ax"; \ 44 .pushsection .kprobes.text, "ax"; \
39 ENTRY(name) 45 ENTRY(name)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 654ef554487..33f0ff0cf63 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -41,18 +41,15 @@ struct memory_block {
41#define MEM_ONLINE (1<<0) /* exposed to userspace */ 41#define MEM_ONLINE (1<<0) /* exposed to userspace */
42#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ 42#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
43#define MEM_OFFLINE (1<<2) /* exposed to userspace */ 43#define MEM_OFFLINE (1<<2) /* exposed to userspace */
44#define MEM_GOING_ONLINE (1<<3)
45#define MEM_CANCEL_ONLINE (1<<4)
46#define MEM_CANCEL_OFFLINE (1<<5)
44 47
45/* 48struct memory_notify {
46 * All of these states are currently kernel-internal for notifying 49 unsigned long start_pfn;
47 * kernel components and architectures. 50 unsigned long nr_pages;
48 * 51 int status_change_nid;
49 * For MEM_MAPPING_INVALID, all notifier chains with priority >0 52};
50 * are called before pfn_to_page() becomes invalid. The priority=0
51 * entry is reserved for the function that actually makes
52 * pfn_to_page() stop working. Any notifiers that want to be called
53 * after that should have priority <0.
54 */
55#define MEM_MAPPING_INVALID (1<<3)
56 53
57struct notifier_block; 54struct notifier_block;
58struct mem_section; 55struct mem_section;
@@ -69,21 +66,31 @@ static inline int register_memory_notifier(struct notifier_block *nb)
69static inline void unregister_memory_notifier(struct notifier_block *nb) 66static inline void unregister_memory_notifier(struct notifier_block *nb)
70{ 67{
71} 68}
69static inline int memory_notify(unsigned long val, void *v)
70{
71 return 0;
72}
72#else 73#else
74extern int register_memory_notifier(struct notifier_block *nb);
75extern void unregister_memory_notifier(struct notifier_block *nb);
73extern int register_new_memory(struct mem_section *); 76extern int register_new_memory(struct mem_section *);
74extern int unregister_memory_section(struct mem_section *); 77extern int unregister_memory_section(struct mem_section *);
75extern int memory_dev_init(void); 78extern int memory_dev_init(void);
76extern int remove_memory_block(unsigned long, struct mem_section *, int); 79extern int remove_memory_block(unsigned long, struct mem_section *, int);
77 80extern int memory_notify(unsigned long val, void *v);
78#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) 81#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
79 82
80 83
81#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 84#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
82 85
86#ifdef CONFIG_MEMORY_HOTPLUG
83#define hotplug_memory_notifier(fn, pri) { \ 87#define hotplug_memory_notifier(fn, pri) { \
84 static struct notifier_block fn##_mem_nb = \ 88 static struct notifier_block fn##_mem_nb = \
85 { .notifier_call = fn, .priority = pri }; \ 89 { .notifier_call = fn, .priority = pri }; \
86 register_memory_notifier(&fn##_mem_nb); \ 90 register_memory_notifier(&fn##_mem_nb); \
87} 91}
92#else
93#define hotplug_memory_notifier(fn, pri) do { } while (0)
94#endif
88 95
89#endif /* _LINUX_MEMORY_H_ */ 96#endif /* _LINUX_MEMORY_H_ */
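
hotplug_memory_notifier() hides the notifier_block boilerplate; the callback receives one of the MEM_* actions together with a struct memory_notify describing the affected range. A minimal sketch of a client (the slub.c hunk later in this patch is the real in-tree user of the same pattern):

static int my_mem_callback(struct notifier_block *self,
			   unsigned long action, void *arg)
{
	struct memory_notify *m = arg;

	switch (action) {
	case MEM_GOING_ONLINE:
		/* allocate per-node state for m->status_change_nid;
		 * an error return here vetoes the online */
		break;
	case MEM_OFFLINE:
		/* pfns m->start_pfn .. m->start_pfn + m->nr_pages are gone */
		break;
	}
	return NOTIFY_OK;
}

static int __init my_subsys_init(void)
{
	hotplug_memory_notifier(my_mem_callback, 0);
	return 0;
}
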
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 768b93359f9..5d2281f661f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -141,6 +141,7 @@ struct pci_dev {
141 unsigned int class; /* 3 bytes: (base,sub,prog-if) */ 141 unsigned int class; /* 3 bytes: (base,sub,prog-if) */
142 u8 revision; /* PCI revision, low byte of class word */ 142 u8 revision; /* PCI revision, low byte of class word */
143 u8 hdr_type; /* PCI header type (`multi' flag masked out) */ 143 u8 hdr_type; /* PCI header type (`multi' flag masked out) */
144 u8 pcie_type; /* PCI-E device/port type */
144 u8 rom_base_reg; /* which config register controls the ROM */ 145 u8 rom_base_reg; /* which config register controls the ROM */
145 u8 pin; /* which interrupt pin this device uses */ 146 u8 pin; /* which interrupt pin this device uses */
146 147
@@ -183,6 +184,7 @@ struct pci_dev {
183 unsigned int msi_enabled:1; 184 unsigned int msi_enabled:1;
184 unsigned int msix_enabled:1; 185 unsigned int msix_enabled:1;
185 unsigned int is_managed:1; 186 unsigned int is_managed:1;
187 unsigned int is_pcie:1;
186 atomic_t enable_cnt; /* pci_enable_device has been called */ 188 atomic_t enable_cnt; /* pci_enable_device has been called */
187 189
188 u32 saved_config_space[16]; /* config space saved at suspend time */ 190 u32 saved_config_space[16]; /* config space saved at suspend time */
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 72bfccd3da2..422eab4958a 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -28,6 +28,8 @@
28#include <linux/reiserfs_fs_sb.h> 28#include <linux/reiserfs_fs_sb.h>
29#endif 29#endif
30 30
31struct fid;
32
31/* 33/*
32 * include/linux/reiser_fs.h 34 * include/linux/reiser_fs.h
33 * 35 *
@@ -1877,12 +1879,10 @@ void reiserfs_delete_inode(struct inode *inode);
1877int reiserfs_write_inode(struct inode *inode, int); 1879int reiserfs_write_inode(struct inode *inode, int);
1878int reiserfs_get_block(struct inode *inode, sector_t block, 1880int reiserfs_get_block(struct inode *inode, sector_t block,
1879 struct buffer_head *bh_result, int create); 1881 struct buffer_head *bh_result, int create);
1880struct dentry *reiserfs_get_dentry(struct super_block *, void *); 1882struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1881struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data, 1883 int fh_len, int fh_type);
1882 int len, int fhtype, 1884struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1883 int (*acceptable) (void *contect, 1885 int fh_len, int fh_type);
1884 struct dentry * de),
1885 void *context);
1886int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 1886int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1887 int connectable); 1887 int connectable);
1888 1888
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1833879f843..3a47871a29d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -187,7 +187,24 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
187 unsigned long onlined_pages = 0; 187 unsigned long onlined_pages = 0;
188 struct zone *zone; 188 struct zone *zone;
189 int need_zonelists_rebuild = 0; 189 int need_zonelists_rebuild = 0;
190 int nid;
191 int ret;
192 struct memory_notify arg;
193
194 arg.start_pfn = pfn;
195 arg.nr_pages = nr_pages;
196 arg.status_change_nid = -1;
197
198 nid = page_to_nid(pfn_to_page(pfn));
199 if (node_present_pages(nid) == 0)
200 arg.status_change_nid = nid;
190 201
202 ret = memory_notify(MEM_GOING_ONLINE, &arg);
203 ret = notifier_to_errno(ret);
204 if (ret) {
205 memory_notify(MEM_CANCEL_ONLINE, &arg);
206 return ret;
207 }
191 /* 208 /*
192 * This doesn't need a lock to do pfn_to_page(). 209 * This doesn't need a lock to do pfn_to_page().
193 * The section can't be removed here because of the 210 * The section can't be removed here because of the
@@ -222,6 +239,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
222 build_all_zonelists(); 239 build_all_zonelists();
223 vm_total_pages = nr_free_pagecache_pages(); 240 vm_total_pages = nr_free_pagecache_pages();
224 writeback_set_ratelimit(); 241 writeback_set_ratelimit();
242
243 if (onlined_pages)
244 memory_notify(MEM_ONLINE, &arg);
245
225 return 0; 246 return 0;
226} 247}
227#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 248#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -467,8 +488,9 @@ int offline_pages(unsigned long start_pfn,
467{ 488{
468 unsigned long pfn, nr_pages, expire; 489 unsigned long pfn, nr_pages, expire;
469 long offlined_pages; 490 long offlined_pages;
470 int ret, drain, retry_max; 491 int ret, drain, retry_max, node;
471 struct zone *zone; 492 struct zone *zone;
493 struct memory_notify arg;
472 494
473 BUG_ON(start_pfn >= end_pfn); 495 BUG_ON(start_pfn >= end_pfn);
474 /* at least, alignment against pageblock is necessary */ 496 /* at least, alignment against pageblock is necessary */
@@ -480,11 +502,27 @@ int offline_pages(unsigned long start_pfn,
480 we assume this for now. .*/ 502 we assume this for now. .*/
481 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 503 if (!test_pages_in_a_zone(start_pfn, end_pfn))
482 return -EINVAL; 504 return -EINVAL;
505
506 zone = page_zone(pfn_to_page(start_pfn));
507 node = zone_to_nid(zone);
508 nr_pages = end_pfn - start_pfn;
509
483 /* set above range as isolated */ 510 /* set above range as isolated */
484 ret = start_isolate_page_range(start_pfn, end_pfn); 511 ret = start_isolate_page_range(start_pfn, end_pfn);
485 if (ret) 512 if (ret)
486 return ret; 513 return ret;
487 nr_pages = end_pfn - start_pfn; 514
515 arg.start_pfn = start_pfn;
516 arg.nr_pages = nr_pages;
517 arg.status_change_nid = -1;
518 if (nr_pages >= node_present_pages(node))
519 arg.status_change_nid = node;
520
521 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
522 ret = notifier_to_errno(ret);
523 if (ret)
524 goto failed_removal;
525
488 pfn = start_pfn; 526 pfn = start_pfn;
489 expire = jiffies + timeout; 527 expire = jiffies + timeout;
490 drain = 0; 528 drain = 0;
@@ -539,20 +577,24 @@ repeat:
539 /* reset pagetype flags */ 577 /* reset pagetype flags */
540 start_isolate_page_range(start_pfn, end_pfn); 578 start_isolate_page_range(start_pfn, end_pfn);
541 /* removal success */ 579 /* removal success */
542 zone = page_zone(pfn_to_page(start_pfn));
543 zone->present_pages -= offlined_pages; 580 zone->present_pages -= offlined_pages;
544 zone->zone_pgdat->node_present_pages -= offlined_pages; 581 zone->zone_pgdat->node_present_pages -= offlined_pages;
545 totalram_pages -= offlined_pages; 582 totalram_pages -= offlined_pages;
546 num_physpages -= offlined_pages; 583 num_physpages -= offlined_pages;
584
547 vm_total_pages = nr_free_pagecache_pages(); 585 vm_total_pages = nr_free_pagecache_pages();
548 writeback_set_ratelimit(); 586 writeback_set_ratelimit();
587
588 memory_notify(MEM_OFFLINE, &arg);
549 return 0; 589 return 0;
550 590
551failed_removal: 591failed_removal:
552 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 592 printk(KERN_INFO "memory offlining %lx to %lx failed\n",
553 start_pfn, end_pfn); 593 start_pfn, end_pfn);
594 memory_notify(MEM_CANCEL_OFFLINE, &arg);
554 /* pushback to free area */ 595 /* pushback to free area */
555 undo_isolate_page_range(start_pfn, end_pfn); 596 undo_isolate_page_range(start_pfn, end_pfn);
597
556 return ret; 598 return ret;
557} 599}
558#else 600#else
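
The GOING_ONLINE/GOING_OFFLINE stages are vetoable: online_pages() and offline_pages() convert the notifier chain's result with notifier_to_errno() and, on error, emit the matching MEM_CANCEL_* event and bail out. A callback therefore vetoes by returning an errno wrapped with notifier_from_errno(), as in this sketch (range_is_pinned() is an assumed helper, not a kernel function):

static int my_veto_callback(struct notifier_block *self,
			    unsigned long action, void *arg)
{
	struct memory_notify *m = arg;

	if (action == MEM_GOING_OFFLINE &&
	    range_is_pinned(m->start_pfn, m->nr_pages))
		return notifier_from_errno(-EBUSY);	/* cancels the offline */
	return NOTIFY_OK;
}
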
diff --git a/mm/shmem.c b/mm/shmem.c
index 289dbb0a6fd..404e53bb212 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2020,33 +2020,25 @@ static int shmem_match(struct inode *ino, void *vfh)
2020 return ino->i_ino == inum && fh[0] == ino->i_generation; 2020 return ino->i_ino == inum && fh[0] == ino->i_generation;
2021} 2021}
2022 2022
2023static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh) 2023static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2024 struct fid *fid, int fh_len, int fh_type)
2024{ 2025{
2025 struct dentry *de = NULL;
2026 struct inode *inode; 2026 struct inode *inode;
2027 __u32 *fh = vfh; 2027 struct dentry *dentry = NULL;
2028 __u64 inum = fh[2]; 2028 u64 inum = fid->raw[2];
2029 inum = (inum << 32) | fh[1]; 2029 inum = (inum << 32) | fid->raw[1];
2030
2031 if (fh_len < 3)
2032 return NULL;
2030 2033
2031 inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh); 2034 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2035 shmem_match, fid->raw);
2032 if (inode) { 2036 if (inode) {
2033 de = d_find_alias(inode); 2037 dentry = d_find_alias(inode);
2034 iput(inode); 2038 iput(inode);
2035 } 2039 }
2036 2040
2037 return de? de: ERR_PTR(-ESTALE); 2041 return dentry;
2038}
2039
2040static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
2041 int len, int type,
2042 int (*acceptable)(void *context, struct dentry *de),
2043 void *context)
2044{
2045 if (len < 3)
2046 return ERR_PTR(-ESTALE);
2047
2048 return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
2049 context);
2050} 2042}
2051 2043
2052static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2044static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
@@ -2079,11 +2071,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2079 return 1; 2071 return 1;
2080} 2072}
2081 2073
2082static struct export_operations shmem_export_ops = { 2074static const struct export_operations shmem_export_ops = {
2083 .get_parent = shmem_get_parent, 2075 .get_parent = shmem_get_parent,
2084 .get_dentry = shmem_get_dentry,
2085 .encode_fh = shmem_encode_fh, 2076 .encode_fh = shmem_encode_fh,
2086 .decode_fh = shmem_decode_fh, 2077 .fh_to_dentry = shmem_fh_to_dentry,
2087}; 2078};
2088 2079
2089static int shmem_parse_options(char *options, int *mode, uid_t *uid, 2080static int shmem_parse_options(char *options, int *mode, uid_t *uid,
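
shmem packs a possibly-64-bit inode number into two 32-bit words, raw[1] (low) and raw[2] (high), with the generation in raw[0]; the decode above reassembles it with a shift-or. A standalone round-trip check (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t inum = 0x0123456789abcdefULL;
	uint32_t raw[3] = { 42,			/* i_generation */
			    (uint32_t)inum,	/* low 32 bits */
			    (uint32_t)(inum >> 32) };
	uint64_t decoded = ((uint64_t)raw[2] << 32) | raw[1];

	assert(decoded == inum);
	return 0;
}
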
diff --git a/mm/slub.c b/mm/slub.c
index e29a42988c7..aac1dd3c657 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
20#include <linux/mempolicy.h> 20#include <linux/mempolicy.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/kallsyms.h> 22#include <linux/kallsyms.h>
23#include <linux/memory.h>
23 24
24/* 25/*
25 * Lock order: 26 * Lock order:
@@ -2694,6 +2695,121 @@ int kmem_cache_shrink(struct kmem_cache *s)
2694} 2695}
2695EXPORT_SYMBOL(kmem_cache_shrink); 2696EXPORT_SYMBOL(kmem_cache_shrink);
2696 2697
2698#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2699static int slab_mem_going_offline_callback(void *arg)
2700{
2701 struct kmem_cache *s;
2702
2703 down_read(&slub_lock);
2704 list_for_each_entry(s, &slab_caches, list)
2705 kmem_cache_shrink(s);
2706 up_read(&slub_lock);
2707
2708 return 0;
2709}
2710
2711static void slab_mem_offline_callback(void *arg)
2712{
2713 struct kmem_cache_node *n;
2714 struct kmem_cache *s;
2715 struct memory_notify *marg = arg;
2716 int offline_node;
2717
2718 offline_node = marg->status_change_nid;
2719
2720 /*
 2721 * If the node still has available memory, we still need its
 2722 * kmem_cache_node, so there is nothing to free here.
2723 */
2724 if (offline_node < 0)
2725 return;
2726
2727 down_read(&slub_lock);
2728 list_for_each_entry(s, &slab_caches, list) {
2729 n = get_node(s, offline_node);
2730 if (n) {
2731 /*
 2732 * If n->nr_slabs > 0, slabs still exist on the node
 2733 * that is going down. We were unable to free them,
 2734 * and the offline_pages() function shouldn't have
 2735 * called this callback. So, we must fail.
2736 */
2737 BUG_ON(atomic_read(&n->nr_slabs));
2738
2739 s->node[offline_node] = NULL;
2740 kmem_cache_free(kmalloc_caches, n);
2741 }
2742 }
2743 up_read(&slub_lock);
2744}
2745
2746static int slab_mem_going_online_callback(void *arg)
2747{
2748 struct kmem_cache_node *n;
2749 struct kmem_cache *s;
2750 struct memory_notify *marg = arg;
2751 int nid = marg->status_change_nid;
2752 int ret = 0;
2753
2754 /*
2755 * If the node's memory is already available, then kmem_cache_node is
2756 * already created. Nothing to do.
2757 */
2758 if (nid < 0)
2759 return 0;
2760
2761 /*
 2762 * We are bringing a node online. No memory is available yet. We must
2763 * allocate a kmem_cache_node structure in order to bring the node
2764 * online.
2765 */
2766 down_read(&slub_lock);
2767 list_for_each_entry(s, &slab_caches, list) {
2768 /*
 2769 * XXX: kmem_cache_alloc_node will fall back to other nodes
 2770 * since memory is not yet available from the node that
 2771 * is being brought up.
2772 */
2773 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
2774 if (!n) {
2775 ret = -ENOMEM;
2776 goto out;
2777 }
2778 init_kmem_cache_node(n);
2779 s->node[nid] = n;
2780 }
2781out:
2782 up_read(&slub_lock);
2783 return ret;
2784}
2785
2786static int slab_memory_callback(struct notifier_block *self,
2787 unsigned long action, void *arg)
2788{
2789 int ret = 0;
2790
2791 switch (action) {
2792 case MEM_GOING_ONLINE:
2793 ret = slab_mem_going_online_callback(arg);
2794 break;
2795 case MEM_GOING_OFFLINE:
2796 ret = slab_mem_going_offline_callback(arg);
2797 break;
2798 case MEM_OFFLINE:
2799 case MEM_CANCEL_ONLINE:
2800 slab_mem_offline_callback(arg);
2801 break;
2802 case MEM_ONLINE:
2803 case MEM_CANCEL_OFFLINE:
2804 break;
2805 }
2806
2807 ret = notifier_from_errno(ret);
2808 return ret;
2809}
2810
2811#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
2812
2697/******************************************************************** 2813/********************************************************************
2698 * Basic setup of slabs 2814 * Basic setup of slabs
2699 *******************************************************************/ 2815 *******************************************************************/
@@ -2715,6 +2831,8 @@ void __init kmem_cache_init(void)
2715 sizeof(struct kmem_cache_node), GFP_KERNEL); 2831 sizeof(struct kmem_cache_node), GFP_KERNEL);
2716 kmalloc_caches[0].refcount = -1; 2832 kmalloc_caches[0].refcount = -1;
2717 caches++; 2833 caches++;
2834
2835 hotplug_memory_notifier(slab_memory_callback, 1);
2718#endif 2836#endif
2719 2837
2720 /* Able to allocate the per node structures */ 2838 /* Able to allocate the per node structures */
diff --git a/security/commoncap.c b/security/commoncap.c
index 43f902750a1..bf67871173e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -190,7 +190,8 @@ int cap_inode_killpriv(struct dentry *dentry)
190 return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); 190 return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
191} 191}
192 192
193static inline int cap_from_disk(__le32 *caps, struct linux_binprm *bprm, 193static inline int cap_from_disk(struct vfs_cap_data *caps,
194 struct linux_binprm *bprm,
194 int size) 195 int size)
195{ 196{
196 __u32 magic_etc; 197 __u32 magic_etc;
@@ -198,7 +199,7 @@ static inline int cap_from_disk(__le32 *caps, struct linux_binprm *bprm,
198 if (size != XATTR_CAPS_SZ) 199 if (size != XATTR_CAPS_SZ)
199 return -EINVAL; 200 return -EINVAL;
200 201
201 magic_etc = le32_to_cpu(caps[0]); 202 magic_etc = le32_to_cpu(caps->magic_etc);
202 203
203 switch ((magic_etc & VFS_CAP_REVISION_MASK)) { 204 switch ((magic_etc & VFS_CAP_REVISION_MASK)) {
204 case VFS_CAP_REVISION: 205 case VFS_CAP_REVISION:
@@ -206,8 +207,8 @@ static inline int cap_from_disk(__le32 *caps, struct linux_binprm *bprm,
206 bprm->cap_effective = true; 207 bprm->cap_effective = true;
207 else 208 else
208 bprm->cap_effective = false; 209 bprm->cap_effective = false;
209 bprm->cap_permitted = to_cap_t( le32_to_cpu(caps[1]) ); 210 bprm->cap_permitted = to_cap_t(le32_to_cpu(caps->permitted));
210 bprm->cap_inheritable = to_cap_t( le32_to_cpu(caps[2]) ); 211 bprm->cap_inheritable = to_cap_t(le32_to_cpu(caps->inheritable));
211 return 0; 212 return 0;
212 default: 213 default:
213 return -EINVAL; 214 return -EINVAL;
@@ -219,7 +220,7 @@ static int get_file_caps(struct linux_binprm *bprm)
219{ 220{
220 struct dentry *dentry; 221 struct dentry *dentry;
221 int rc = 0; 222 int rc = 0;
222 __le32 v1caps[XATTR_CAPS_SZ]; 223 struct vfs_cap_data incaps;
223 struct inode *inode; 224 struct inode *inode;
224 225
225 if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) { 226 if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) {
@@ -232,8 +233,14 @@ static int get_file_caps(struct linux_binprm *bprm)
232 if (!inode->i_op || !inode->i_op->getxattr) 233 if (!inode->i_op || !inode->i_op->getxattr)
233 goto out; 234 goto out;
234 235
235 rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &v1caps, 236 rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
236 XATTR_CAPS_SZ); 237 if (rc > 0) {
238 if (rc == XATTR_CAPS_SZ)
239 rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS,
240 &incaps, XATTR_CAPS_SZ);
241 else
242 rc = -EINVAL;
243 }
237 if (rc == -ENODATA || rc == -EOPNOTSUPP) { 244 if (rc == -ENODATA || rc == -EOPNOTSUPP) {
238 /* no data, that's ok */ 245 /* no data, that's ok */
239 rc = 0; 246 rc = 0;
@@ -242,7 +249,7 @@ static int get_file_caps(struct linux_binprm *bprm)
242 if (rc < 0) 249 if (rc < 0)
243 goto out; 250 goto out;
244 251
245 rc = cap_from_disk(v1caps, bprm, rc); 252 rc = cap_from_disk(&incaps, bprm, rc);
246 if (rc) 253 if (rc)
247 printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", 254 printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
248 __FUNCTION__, rc, bprm->filename); 255 __FUNCTION__, rc, bprm->filename);
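
get_file_caps() now probes the xattr's size first (getxattr with a NULL buffer returns the stored length) and reads into the fixed-size struct only when it is exactly XATTR_CAPS_SZ. The same size-then-read pattern from userspace, as a hedged sketch (the on-disk layout mirrors the v1 struct vfs_cap_data above; illustrative only):

#include <stdint.h>
#include <sys/types.h>
#include <sys/xattr.h>

struct vfs_cap_data_v1 {
	uint32_t magic_etc;
	uint32_t permitted;
	uint32_t inheritable;
};

int read_caps(const char *path, struct vfs_cap_data_v1 *caps)
{
	ssize_t sz = getxattr(path, "security.capability", NULL, 0);

	if (sz < 0)
		return -1;		/* no xattr, or not supported */
	if (sz != sizeof(*caps))
		return -1;		/* unknown revision/size */
	return getxattr(path, "security.capability", caps,
			sizeof(*caps)) == (ssize_t)sizeof(*caps) ? 0 : -1;
}
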