diff options
Diffstat (limited to 'Documentation/x86/exception-tables.txt')
| -rw-r--r-- | Documentation/x86/exception-tables.txt | 292 |
1 files changed, 292 insertions, 0 deletions
diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt new file mode 100644 index 000000000000..32901aa36f0a --- /dev/null +++ b/Documentation/x86/exception-tables.txt | |||
| @@ -0,0 +1,292 @@ | |||
| 1 | Kernel level exception handling in Linux | ||
| 2 | Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com> | ||
| 3 | |||
| 4 | When a process runs in kernel mode, it often has to access user | ||
| 5 | mode memory whose address has been passed by an untrusted program. | ||
| 6 | To protect itself the kernel has to verify this address. | ||
| 7 | |||
| 8 | In older versions of Linux this was done with the | ||
| 9 | int verify_area(int type, const void * addr, unsigned long size) | ||
| 10 | function (which has since been replaced by access_ok()). | ||
| 11 | |||
| 12 | This function verified that the memory area starting at address | ||
| 13 | 'addr' and of size 'size' was accessible for the operation specified | ||
| 14 | in type (read or write). To do this, verify_area had to look up the | ||
| 15 | virtual memory area (vma) that contained the address addr. In the | ||
| 16 | normal case (correctly working program), this test was successful. | ||
| 17 | It only failed for a few buggy programs. In some kernel profiling | ||
| 18 | tests, this normally unneeded verification used up a considerable | ||
| 19 | amount of time. | ||
| 20 | |||
| 21 | To overcome this situation, Linus decided to let the virtual memory | ||
| 22 | hardware present in every Linux-capable CPU handle this test. | ||
| 23 | |||
| 24 | How does this work? | ||
| 25 | |||
| 26 | Whenever the kernel tries to access an address that is currently not | ||
| 27 | accessible, the CPU generates a page fault exception and calls the | ||
| 28 | page fault handler | ||
| 29 | |||
| 30 | void do_page_fault(struct pt_regs *regs, unsigned long error_code) | ||
| 31 | |||
| 32 | in arch/x86/mm/fault.c. The parameters on the stack are set up by | ||
| 33 | the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter | ||
| 34 | regs is a pointer to the saved registers on the stack, error_code | ||
| 35 | contains a reason code for the exception. | ||
| 36 | |||
| 37 | do_page_fault first obtains the inaccessible address from the CPU | ||
| 38 | control register CR2. If the address is within the virtual address | ||
| 39 | space of the process, the fault probably occurred because the page | ||
| 40 | was not swapped in, was write protected, or something similar. However, | ||
| 41 | we are interested in the other case: the address is not valid, there | ||
| 42 | is no vma that contains this address. In this case, the kernel jumps | ||
| 43 | to the bad_area label. | ||
| 44 | |||
| 45 | There it uses the address of the instruction that caused the exception | ||
| 46 | (i.e. regs->eip) to find an address where the execution can continue | ||
| 47 | (fixup). If this search is successful, the fault handler modifies the | ||
| 48 | return address (again regs->eip) and returns. The execution will | ||
| 49 | continue at the address in fixup. | ||
| 50 | |||
| 51 | Where does fixup point to? | ||
| 52 | |||
| 53 | Since we jump to the contents of fixup, fixup obviously points | ||
| 54 | to executable code. This code is hidden inside the user access macros. | ||
| 55 | I have picked the get_user macro defined in arch/x86/include/asm/uaccess.h | ||
| 56 | as an example. The definition is somewhat hard to follow, so let's peek at | ||
| 57 | the code generated by the preprocessor and the compiler. I selected | ||
| 58 | the get_user call in drivers/char/sysrq.c for a detailed examination. | ||
| 59 | |||
| 60 | The original code in sysrq.c line 587: | ||
| 61 | get_user(c, buf); | ||
| 62 | |||
| 63 | The preprocessor output (edited to become somewhat readable): | ||
| 64 | |||
| 65 | ( | ||
| 66 | { | ||
| 67 | long __gu_err = - 14 , __gu_val = 0; | ||
| 68 | const __typeof__(*( ( buf ) )) *__gu_addr = ((buf)); | ||
| 69 | if (((((0 + current_set[0])->tss.segment) == 0x18 ) || | ||
| 70 | (((sizeof(*(buf))) <= 0xC0000000UL) && | ||
| 71 | ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf))))))) | ||
| 72 | do { | ||
| 73 | __gu_err = 0; | ||
| 74 | switch ((sizeof(*(buf)))) { | ||
| 75 | case 1: | ||
| 76 | __asm__ __volatile__( | ||
| 77 | "1: mov" "b" " %2,%" "b" "1\n" | ||
| 78 | "2:\n" | ||
| 79 | ".section .fixup,\"ax\"\n" | ||
| 80 | "3: movl %3,%0\n" | ||
| 81 | " xor" "b" " %" "b" "1,%" "b" "1\n" | ||
| 82 | " jmp 2b\n" | ||
| 83 | ".section __ex_table,\"a\"\n" | ||
| 84 | " .align 4\n" | ||
| 85 | " .long 1b,3b\n" | ||
| 86 | ".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *) | ||
| 87 | ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ; | ||
| 88 | break; | ||
| 89 | case 2: | ||
| 90 | __asm__ __volatile__( | ||
| 91 | "1: mov" "w" " %2,%" "w" "1\n" | ||
| 92 | "2:\n" | ||
| 93 | ".section .fixup,\"ax\"\n" | ||
| 94 | "3: movl %3,%0\n" | ||
| 95 | " xor" "w" " %" "w" "1,%" "w" "1\n" | ||
| 96 | " jmp 2b\n" | ||
| 97 | ".section __ex_table,\"a\"\n" | ||
| 98 | " .align 4\n" | ||
| 99 | " .long 1b,3b\n" | ||
| 100 | ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) | ||
| 101 | ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )); | ||
| 102 | break; | ||
| 103 | case 4: | ||
| 104 | __asm__ __volatile__( | ||
| 105 | "1: mov" "l" " %2,%" "" "1\n" | ||
| 106 | "2:\n" | ||
| 107 | ".section .fixup,\"ax\"\n" | ||
| 108 | "3: movl %3,%0\n" | ||
| 109 | " xor" "l" " %" "" "1,%" "" "1\n" | ||
| 110 | " jmp 2b\n" | ||
| 111 | ".section __ex_table,\"a\"\n" | ||
| 112 | " .align 4\n" " .long 1b,3b\n" | ||
| 113 | ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) | ||
| 114 | ( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err)); | ||
| 115 | break; | ||
| 116 | default: | ||
| 117 | (__gu_val) = __get_user_bad(); | ||
| 118 | } | ||
| 119 | } while (0) ; | ||
| 120 | ((c)) = (__typeof__(*((buf))))__gu_val; | ||
| 121 | __gu_err; | ||
| 122 | } | ||
| 123 | ); | ||
| 124 | |||
| 125 | WOW! Black GCC/assembly magic. This is impossible to follow, so let's | ||
| 126 | see what code gcc generates: | ||
| 127 | |||
| 128 | > xorl %edx,%edx | ||
| 129 | > movl current_set,%eax | ||
| 130 | > cmpl $24,788(%eax) | ||
| 131 | > je .L1424 | ||
| 132 | > cmpl $-1073741825,64(%esp) | ||
| 133 | > ja .L1423 | ||
| 134 | > .L1424: | ||
| 135 | > movl %edx,%eax | ||
| 136 | > movl 64(%esp),%ebx | ||
| 137 | > #APP | ||
| 138 | > 1: movb (%ebx),%dl /* this is the actual user access */ | ||
| 139 | > 2: | ||
| 140 | > .section .fixup,"ax" | ||
| 141 | > 3: movl $-14,%eax | ||
| 142 | > xorb %dl,%dl | ||
| 143 | > jmp 2b | ||
| 144 | > .section __ex_table,"a" | ||
| 145 | > .align 4 | ||
| 146 | > .long 1b,3b | ||
| 147 | > .text | ||
| 148 | > #NO_APP | ||
| 149 | > .L1423: | ||
| 150 | > movzbl %dl,%esi | ||
| 151 | |||
| 152 | The optimizer does a good job and gives us something we can actually | ||
| 153 | understand. Can we? The actual user access is quite obvious. Thanks | ||
| 154 | to the unified address space we can just access the address in user | ||
| 155 | memory. But what does the .section stuff do????? | ||
| 156 | |||
| 157 | To understand this we have to look at the final kernel: | ||
| 158 | |||
| 159 | > objdump --section-headers vmlinux | ||
| 160 | > | ||
| 161 | > vmlinux: file format elf32-i386 | ||
| 162 | > | ||
| 163 | > Sections: | ||
| 164 | > Idx Name Size VMA LMA File off Algn | ||
| 165 | > 0 .text 00098f40 c0100000 c0100000 00001000 2**4 | ||
| 166 | > CONTENTS, ALLOC, LOAD, READONLY, CODE | ||
| 167 | > 1 .fixup 000016bc c0198f40 c0198f40 00099f40 2**0 | ||
| 168 | > CONTENTS, ALLOC, LOAD, READONLY, CODE | ||
| 169 | > 2 .rodata 0000f127 c019a5fc c019a5fc 0009b5fc 2**2 | ||
| 170 | > CONTENTS, ALLOC, LOAD, READONLY, DATA | ||
| 171 | > 3 __ex_table 000015c0 c01a9724 c01a9724 000aa724 2**2 | ||
| 172 | > CONTENTS, ALLOC, LOAD, READONLY, DATA | ||
| 173 | > 4 .data 0000ea58 c01abcf0 c01abcf0 000abcf0 2**4 | ||
| 174 | > CONTENTS, ALLOC, LOAD, DATA | ||
| 175 | > 5 .bss 00018e21 c01ba748 c01ba748 000ba748 2**2 | ||
| 176 | > ALLOC | ||
| 177 | > 6 .comment 00000ec4 00000000 00000000 000ba748 2**0 | ||
| 178 | > CONTENTS, READONLY | ||
| 179 | > 7 .note 00001068 00000ec4 00000ec4 000bb60c 2**0 | ||
| 180 | > CONTENTS, READONLY | ||
| 181 | |||
| 182 | There are obviously two non-standard ELF sections in the generated object | ||
| 183 | file. But first we want to find out what happened to our code in the | ||
| 184 | final kernel executable: | ||
| 185 | |||
| 186 | > objdump --disassemble --section=.text vmlinux | ||
| 187 | > | ||
| 188 | > c017e785 <do_con_write+c1> xorl %edx,%edx | ||
| 189 | > c017e787 <do_con_write+c3> movl 0xc01c7bec,%eax | ||
| 190 | > c017e78c <do_con_write+c8> cmpl $0x18,0x314(%eax) | ||
| 191 | > c017e793 <do_con_write+cf> je c017e79f <do_con_write+db> | ||
| 192 | > c017e795 <do_con_write+d1> cmpl $0xbfffffff,0x40(%esp,1) | ||
| 193 | > c017e79d <do_con_write+d9> ja c017e7a7 <do_con_write+e3> | ||
| 194 | > c017e79f <do_con_write+db> movl %edx,%eax | ||
| 195 | > c017e7a1 <do_con_write+dd> movl 0x40(%esp,1),%ebx | ||
| 196 | > c017e7a5 <do_con_write+e1> movb (%ebx),%dl | ||
| 197 | > c017e7a7 <do_con_write+e3> movzbl %dl,%esi | ||
| 198 | |||
| 199 | The whole user memory access is reduced to 10 x86 machine instructions. | ||
| 200 | The instructions bracketed in the .section directives are no longer | ||
| 201 | in the normal execution path. They are located in a different section | ||
| 202 | of the executable file: | ||
| 203 | |||
| 204 | > objdump --disassemble --section=.fixup vmlinux | ||
| 205 | > | ||
| 206 | > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax | ||
| 207 | > c0199ffa <.fixup+10ba> xorb %dl,%dl | ||
| 208 | > c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3> | ||
| 209 | |||
| 210 | And finally: | ||
| 211 | > objdump --full-contents --section=__ex_table vmlinux | ||
| 212 | > | ||
| 213 | > c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................ | ||
| 214 | > c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................ | ||
| 215 | > c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................ | ||
| 216 | |||
| 217 | or in human readable byte order: | ||
| 218 | |||
| 219 | > c01aa7c4 c017c093 c0199fe0 c017c097 c017c099 ................ | ||
| 220 | > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................ | ||
| 221 | ^^^^^^^^^^^^^^^^^ | ||
| 222 | this is the interesting part! | ||
| 223 | > c01aa7e4 c0180a08 c019a001 c0180a0a c019a004 ................ | ||
| 224 | |||
| 225 | What happened? The assembly directives | ||
| 226 | |||
| 227 | .section .fixup,"ax" | ||
| 228 | .section __ex_table,"a" | ||
| 229 | |||
| 230 | told the assembler to move the following code to the specified | ||
| 231 | sections in the ELF object file. So the instructions | ||
| 232 | 3: movl $-14,%eax | ||
| 233 | xorb %dl,%dl | ||
| 234 | jmp 2b | ||
| 235 | ended up in the .fixup section of the object file and the addresses | ||
| 236 | .long 1b,3b | ||
| 237 | ended up in the __ex_table section of the object file. 1b and 3b | ||
| 238 | are local labels. The local label 1b (1b stands for next label 1 | ||
| 239 | backward) is the address of the instruction that might fault, i.e. | ||
| 240 | in our case the address of the label 1 is c017e7a5: | ||
| 241 | the original assembly code: > 1: movb (%ebx),%dl | ||
| 242 | and linked in vmlinux : > c017e7a5 <do_con_write+e1> movb (%ebx),%dl | ||
| 243 | |||
| 244 | The local label 3 (backwards again) is the address of the code to handle | ||
| 245 | the fault, in our case the actual value is c0199ff5: | ||
| 246 | the original assembly code: > 3: movl $-14,%eax | ||
| 247 | and linked in vmlinux : > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax | ||
| 248 | |||
| 249 | The assembly code | ||
| 250 | > .section __ex_table,"a" | ||
| 251 | > .align 4 | ||
| 252 | > .long 1b,3b | ||
| 253 | |||
| 254 | becomes the value pair | ||
| 255 | > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................ | ||
| 256 | ^this is ^this is | ||
| 257 | 1b 3b | ||
| 258 | c017e7a5,c0199ff5 in the exception table of the kernel. | ||
| 259 | |||
| 260 | So, what actually happens if a fault from kernel mode with no suitable | ||
| 261 | vma occurs? | ||
| 262 | |||
| 263 | 1.) access to invalid address: | ||
| 264 | > c017e7a5 <do_con_write+e1> movb (%ebx),%dl | ||
| 265 | 2.) MMU generates exception | ||
| 266 | 3.) CPU calls do_page_fault | ||
| 267 | 4.) do_page_fault calls search_exception_table (regs->eip == c017e7a5); | ||
| 268 | 5.) search_exception_table looks up the address c017e7a5 in the | ||
| 269 | exception table (i.e. the contents of the ELF section __ex_table) | ||
| 270 | and returns the address of the associated fault handling code c0199ff5. | ||
| 271 | 6.) do_page_fault modifies its own return address to point to the fault | ||
| 272 | handling code and returns. | ||
| 273 | 7.) execution continues in the fault handling code. | ||
| 274 | 8.) 8a) EAX becomes -EFAULT (== -14) | ||
| 275 | 8b) DL becomes zero (the value we "read" from user space) | ||
| 276 | 8c) execution continues at local label 2 (address of the | ||
| 277 | instruction immediately after the faulting user access). | ||
| 278 | |||
| 279 | The steps 8a to 8c in a certain way emulate the faulting instruction. | ||
| 280 | |||
| 281 | That's it, mostly. If you look at our example, you might ask why | ||
| 282 | we set EAX to -EFAULT in the exception handler code. Well, the | ||
| 283 | get_user macro actually returns a value: 0, if the user access was | ||
| 284 | successful, -EFAULT on failure. Our original code did not test this | ||
| 285 | return value; however, the inline assembly code in get_user tries to | ||
| 286 | return -EFAULT. GCC selected EAX to return this value. | ||
| 287 | |||
| 288 | NOTE: | ||
| 289 | Due to the way that the exception table is built and needs to be ordered, | ||
| 290 | only use exceptions for code in the .text section. Any other section | ||
| 291 | will cause the exception table to not be sorted correctly, and the | ||
| 292 | exceptions will fail. | ||
