| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 20:05:15 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 20:05:15 -0400 |
| commit | 8e204874db000928e37199c2db82b7eb8966cc3c (patch) | |
| tree | eae66035cb761c3c5a79e98b92280b5156bc01ef | |
| parent | 3e0b8df79ddb8955d2cce5e858972a9cfe763384 (diff) | |
| parent | aafade242ff24fac3aabf61c7861dfa44a3c2445 (diff) | |
Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-64, vdso: Do not allocate memory for the vDSO
clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option
x86, vdso: Drop now wrong comment
Document the vDSO and add a reference parser
ia64: Replace clocksource.fsys_mmio with generic arch data
x86-64: Move vread_tsc and vread_hpet into the vDSO
clocksource: Replace vread with generic arch data
x86-64: Add --no-undefined to vDSO build
x86-64: Allow alternative patching in the vDSO
x86: Make alternative instruction pointers relative
x86-64: Improve vsyscall emulation CS and RIP handling
x86-64: Emulate legacy vsyscalls
x86-64: Fill unused parts of the vsyscall page with 0xcc
x86-64: Remove vsyscall number 3 (venosys)
x86-64: Map the HPET NX
x86-64: Remove kernel.vsyscall64 sysctl
x86-64: Give vvars their own page
x86-64: Document some of entry_64.S
x86-64: Fix alignment of jiffies variable
41 files changed, 927 insertions, 378 deletions
diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso new file mode 100644 index 000000000000..8a1cbb594497 --- /dev/null +++ b/Documentation/ABI/stable/vdso | |||
| @@ -0,0 +1,27 @@ | |||
| 1 | On some architectures, when the kernel loads any userspace program it | ||
| 2 | maps an ELF DSO into that program's address space. This DSO is called | ||
| 3 | the vDSO and it often contains useful and highly-optimized alternatives | ||
| 4 | to real syscalls. | ||
| 5 | |||
| 6 | These functions are called just like ordinary C functions according to | ||
| 7 | your platform's ABI. Call them from a sensible context. (For example, | ||
| 8 | if you set CS on x86 to something strange, the vDSO functions are | ||
| 9 | within their rights to crash.) In addition, if you pass a bad | ||
| 10 | pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT. | ||
| 11 | |||
| 12 | To find the DSO, parse the auxiliary vector passed to the program's | ||
| 13 | entry point. The AT_SYSINFO_EHDR entry will point to the vDSO. | ||
| 14 | |||
| 15 | The vDSO uses symbol versioning; whenever you request a symbol from the | ||
| 16 | vDSO, specify the version you are expecting. | ||
| 17 | |||
| 18 | Programs that dynamically link to glibc will use the vDSO automatically. | ||
| 19 | Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c. | ||
| 20 | |||
| 21 | Unless otherwise noted, the set of symbols with any given version and the | ||
| 22 | ABI of those symbols is considered stable. It may vary across architectures, | ||
| 23 | though. | ||
| 24 | |||
| 25 | (As of this writing, this ABI documentation has been confirmed for x86_64. | ||
| 26 | The maintainers of the other vDSO-using architectures should confirm | ||
| 27 | that it is correct for their architecture.) \ No newline at end of file | ||
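The ABI text above boils down to a single auxv lookup. As a minimal sketch (assuming glibc's getauxval(3), which appeared in glibc 2.16 and so postdates this commit; older or libc-free programs must walk auxv by hand, as parse_vdso.c below does):

```c
/*
 * Minimal sketch of locating the vDSO, assuming glibc >= 2.16 for
 * getauxval(). Without it, walk the auxv array manually as
 * Documentation/vDSO/parse_vdso.c does.
 */
#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	unsigned long base = getauxval(AT_SYSINFO_EHDR);

	if (!base) {
		fprintf(stderr, "no vDSO mapped by the kernel\n");
		return 1;
	}
	/* base points at a complete ELF header that a parser can consume. */
	printf("vDSO ELF image at %#lx\n", base);
	return 0;
}
```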
diff --git a/Documentation/vDSO/parse_vdso.c b/Documentation/vDSO/parse_vdso.c new file mode 100644 index 000000000000..85870208edcf --- /dev/null +++ b/Documentation/vDSO/parse_vdso.c | |||
| @@ -0,0 +1,256 @@ | |||
| 1 | /* | ||
| 2 | * parse_vdso.c: Linux reference vDSO parser | ||
| 3 | * Written by Andrew Lutomirski, 2011. | ||
| 4 | * | ||
| 5 | * This code is meant to be linked in to various programs that run on Linux. | ||
| 6 | * As such, it is available with as few restrictions as possible. This file | ||
| 7 | * is licensed under the Creative Commons Zero License, version 1.0, | ||
| 8 | * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode | ||
| 9 | * | ||
| 10 | * The vDSO is a regular ELF DSO that the kernel maps into user space when | ||
| 11 | * it starts a program. It works equally well in statically and dynamically | ||
| 12 | * linked binaries. | ||
| 13 | * | ||
| 14 | * This code is tested on x86_64. In principle it should work on any 64-bit | ||
| 15 | * architecture that has a vDSO. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include <stdbool.h> | ||
| 19 | #include <stdint.h> | ||
| 20 | #include <string.h> | ||
| 21 | #include <elf.h> | ||
| 22 | |||
| 23 | /* | ||
| 24 | * To use this vDSO parser, first call one of the vdso_init_* functions. | ||
| 25 | * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR | ||
| 26 | * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. | ||
| 27 | * Then call vdso_sym for each symbol you want. For example, to look up | ||
| 28 | * gettimeofday on x86_64, use: | ||
| 29 | * | ||
| 30 | * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); | ||
| 31 | * or | ||
| 32 | * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); | ||
| 33 | * | ||
| 34 | * vdso_sym will return 0 if the symbol doesn't exist or if the init function | ||
| 35 | * failed or was not called. vdso_sym is a little slow, so its return value | ||
| 36 | * should be cached. | ||
| 37 | * | ||
| 38 | * vdso_sym is threadsafe; the init functions are not. | ||
| 39 | * | ||
| 40 | * These are the prototypes: | ||
| 41 | */ | ||
| 42 | extern void vdso_init_from_auxv(void *auxv); | ||
| 43 | extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); | ||
| 44 | extern void *vdso_sym(const char *version, const char *name); | ||
| 45 | |||
| 46 | |||
| 47 | /* And here's the code. */ | ||
| 48 | |||
| 49 | #ifndef __x86_64__ | ||
| 50 | # error Not yet ported to non-x86_64 architectures | ||
| 51 | #endif | ||
| 52 | |||
| 53 | static struct vdso_info | ||
| 54 | { | ||
| 55 | bool valid; | ||
| 56 | |||
| 57 | /* Load information */ | ||
| 58 | uintptr_t load_addr; | ||
| 59 | uintptr_t load_offset; /* load_addr - recorded vaddr */ | ||
| 60 | |||
| 61 | /* Symbol table */ | ||
| 62 | Elf64_Sym *symtab; | ||
| 63 | const char *symstrings; | ||
| 64 | Elf64_Word *bucket, *chain; | ||
| 65 | Elf64_Word nbucket, nchain; | ||
| 66 | |||
| 67 | /* Version table */ | ||
| 68 | Elf64_Versym *versym; | ||
| 69 | Elf64_Verdef *verdef; | ||
| 70 | } vdso_info; | ||
| 71 | |||
| 72 | /* Straight from the ELF specification. */ | ||
| 73 | static unsigned long elf_hash(const unsigned char *name) | ||
| 74 | { | ||
| 75 | unsigned long h = 0, g; | ||
| 76 | while (*name) | ||
| 77 | { | ||
| 78 | h = (h << 4) + *name++; | ||
| 79 | if (g = h & 0xf0000000) | ||
| 80 | h ^= g >> 24; | ||
| 81 | h &= ~g; | ||
| 82 | } | ||
| 83 | return h; | ||
| 84 | } | ||
| 85 | |||
| 86 | void vdso_init_from_sysinfo_ehdr(uintptr_t base) | ||
| 87 | { | ||
| 88 | size_t i; | ||
| 89 | bool found_vaddr = false; | ||
| 90 | |||
| 91 | vdso_info.valid = false; | ||
| 92 | |||
| 93 | vdso_info.load_addr = base; | ||
| 94 | |||
| 95 | Elf64_Ehdr *hdr = (Elf64_Ehdr*)base; | ||
| 96 | Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff); | ||
| 97 | Elf64_Dyn *dyn = 0; | ||
| 98 | |||
| 99 | /* | ||
| 100 | * We need two things from the segment table: the load offset | ||
| 101 | * and the dynamic table. | ||
| 102 | */ | ||
| 103 | for (i = 0; i < hdr->e_phnum; i++) | ||
| 104 | { | ||
| 105 | if (pt[i].p_type == PT_LOAD && !found_vaddr) { | ||
| 106 | found_vaddr = true; | ||
| 107 | vdso_info.load_offset = base | ||
| 108 | + (uintptr_t)pt[i].p_offset | ||
| 109 | - (uintptr_t)pt[i].p_vaddr; | ||
| 110 | } else if (pt[i].p_type == PT_DYNAMIC) { | ||
| 111 | dyn = (Elf64_Dyn*)(base + pt[i].p_offset); | ||
| 112 | } | ||
| 113 | } | ||
| 114 | |||
| 115 | if (!found_vaddr || !dyn) | ||
| 116 | return; /* Failed */ | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Fish out the useful bits of the dynamic table. | ||
| 120 | */ | ||
| 121 | Elf64_Word *hash = 0; | ||
| 122 | vdso_info.symstrings = 0; | ||
| 123 | vdso_info.symtab = 0; | ||
| 124 | vdso_info.versym = 0; | ||
| 125 | vdso_info.verdef = 0; | ||
| 126 | for (i = 0; dyn[i].d_tag != DT_NULL; i++) { | ||
| 127 | switch (dyn[i].d_tag) { | ||
| 128 | case DT_STRTAB: | ||
| 129 | vdso_info.symstrings = (const char *) | ||
| 130 | ((uintptr_t)dyn[i].d_un.d_ptr | ||
| 131 | + vdso_info.load_offset); | ||
| 132 | break; | ||
| 133 | case DT_SYMTAB: | ||
| 134 | vdso_info.symtab = (Elf64_Sym *) | ||
| 135 | ((uintptr_t)dyn[i].d_un.d_ptr | ||
| 136 | + vdso_info.load_offset); | ||
| 137 | break; | ||
| 138 | case DT_HASH: | ||
| 139 | hash = (Elf64_Word *) | ||
| 140 | ((uintptr_t)dyn[i].d_un.d_ptr | ||
| 141 | + vdso_info.load_offset); | ||
| 142 | break; | ||
| 143 | case DT_VERSYM: | ||
| 144 | vdso_info.versym = (Elf64_Versym *) | ||
| 145 | ((uintptr_t)dyn[i].d_un.d_ptr | ||
| 146 | + vdso_info.load_offset); | ||
| 147 | break; | ||
| 148 | case DT_VERDEF: | ||
| 149 | vdso_info.verdef = (Elf64_Verdef *) | ||
| 150 | ((uintptr_t)dyn[i].d_un.d_ptr | ||
| 151 | + vdso_info.load_offset); | ||
| 152 | break; | ||
| 153 | } | ||
| 154 | } | ||
| 155 | if (!vdso_info.symstrings || !vdso_info.symtab || !hash) | ||
| 156 | return; /* Failed */ | ||
| 157 | |||
| 158 | if (!vdso_info.verdef) | ||
| 159 | vdso_info.versym = 0; | ||
| 160 | |||
| 161 | /* Parse the hash table header. */ | ||
| 162 | vdso_info.nbucket = hash[0]; | ||
| 163 | vdso_info.nchain = hash[1]; | ||
| 164 | vdso_info.bucket = &hash[2]; | ||
| 165 | vdso_info.chain = &hash[vdso_info.nbucket + 2]; | ||
| 166 | |||
| 167 | /* That's all we need. */ | ||
| 168 | vdso_info.valid = true; | ||
| 169 | } | ||
| 170 | |||
| 171 | static bool vdso_match_version(Elf64_Versym ver, | ||
| 172 | const char *name, Elf64_Word hash) | ||
| 173 | { | ||
| 174 | /* | ||
| 175 | * This is a helper function to check if the version indexed by | ||
| 176 | * ver matches name (which hashes to hash). | ||
| 177 | * | ||
| 178 | * The version definition table is a mess, and I don't know how | ||
| 179 | * to do this in better than linear time without allocating memory | ||
| 180 | * to build an index. I also don't know why the table has | ||
| 181 | * variable size entries in the first place. | ||
| 182 | * | ||
| 183 | * For added fun, I can't find a comprehensible specification of how | ||
| 184 | * to parse all the weird flags in the table. | ||
| 185 | * | ||
| 186 | * So I just parse the whole table every time. | ||
| 187 | */ | ||
| 188 | |||
| 189 | /* First step: find the version definition */ | ||
| 190 | ver &= 0x7fff; /* Apparently bit 15 means "hidden" */ | ||
| 191 | Elf64_Verdef *def = vdso_info.verdef; | ||
| 192 | while(true) { | ||
| 193 | if ((def->vd_flags & VER_FLG_BASE) == 0 | ||
| 194 | && (def->vd_ndx & 0x7fff) == ver) | ||
| 195 | break; | ||
| 196 | |||
| 197 | if (def->vd_next == 0) | ||
| 198 | return false; /* No definition. */ | ||
| 199 | |||
| 200 | def = (Elf64_Verdef *)((char *)def + def->vd_next); | ||
| 201 | } | ||
| 202 | |||
| 203 | /* Now figure out whether it matches. */ | ||
| 204 | Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux); | ||
| 205 | return def->vd_hash == hash | ||
| 206 | && !strcmp(name, vdso_info.symstrings + aux->vda_name); | ||
| 207 | } | ||
| 208 | |||
| 209 | void *vdso_sym(const char *version, const char *name) | ||
| 210 | { | ||
| 211 | unsigned long ver_hash; | ||
| 212 | if (!vdso_info.valid) | ||
| 213 | return 0; | ||
| 214 | |||
| 215 | ver_hash = elf_hash(version); | ||
| 216 | Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; | ||
| 217 | |||
| 218 | for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { | ||
| 219 | Elf64_Sym *sym = &vdso_info.symtab[chain]; | ||
| 220 | |||
| 221 | /* Check for a defined global or weak function w/ right name. */ | ||
| 222 | if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) | ||
| 223 | continue; | ||
| 224 | if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && | ||
| 225 | ELF64_ST_BIND(sym->st_info) != STB_WEAK) | ||
| 226 | continue; | ||
| 227 | if (sym->st_shndx == SHN_UNDEF) | ||
| 228 | continue; | ||
| 229 | if (strcmp(name, vdso_info.symstrings + sym->st_name)) | ||
| 230 | continue; | ||
| 231 | |||
| 232 | /* Check symbol version. */ | ||
| 233 | if (vdso_info.versym | ||
| 234 | && !vdso_match_version(vdso_info.versym[chain], | ||
| 235 | version, ver_hash)) | ||
| 236 | continue; | ||
| 237 | |||
| 238 | return (void *)(vdso_info.load_offset + sym->st_value); | ||
| 239 | } | ||
| 240 | |||
| 241 | return 0; | ||
| 242 | } | ||
| 243 | |||
| 244 | void vdso_init_from_auxv(void *auxv) | ||
| 245 | { | ||
| 246 | Elf64_auxv_t *elf_auxv = auxv; | ||
| 247 | for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++) | ||
| 248 | { | ||
| 249 | if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) { | ||
| 250 | vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val); | ||
| 251 | return; | ||
| 252 | } | ||
| 253 | } | ||
| 254 | |||
| 255 | vdso_info.valid = false; | ||
| 256 | } | ||
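Taken together, the comment block at the top of the file defines the whole calling protocol. A hypothetical wrapper (illustrative names; it assumes parse_vdso.c is linked into the same binary and that auxv has already been located, e.g. just past envp on the initial stack, as vdso_test.c below demonstrates) might look like:

```c
/*
 * Hypothetical caller of the parser above. The symbol name and the
 * "LINUX_2.6" version string come from the usage comment at the top
 * of parse_vdso.c. Single-threaded initialization is assumed, since
 * the init functions are documented as not thread-safe.
 */
#include <stddef.h>
#include <sys/time.h>

extern void vdso_init_from_auxv(void *auxv);
extern void *vdso_sym(const char *version, const char *name);

typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);

long vdso_gettimeofday(struct timeval *tv, void *auxv)
{
	/* vdso_sym is a little slow, so cache its result. */
	static gtod_t gtod;

	if (!gtod) {
		vdso_init_from_auxv(auxv);
		gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
		if (!gtod)
			return -1;	/* caller should fall back to the syscall */
	}
	return gtod(tv, NULL);
}
```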
diff --git a/Documentation/vDSO/vdso_test.c b/Documentation/vDSO/vdso_test.c new file mode 100644 index 000000000000..fff633432dff --- /dev/null +++ b/Documentation/vDSO/vdso_test.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | /* | ||
| 2 | * vdso_test.c: Sample code to test parse_vdso.c on x86_64 | ||
| 3 | * Copyright (c) 2011 Andy Lutomirski | ||
| 4 | * Subject to the GNU General Public License, version 2 | ||
| 5 | * | ||
| 6 | * You can amuse yourself by compiling with: | ||
| 7 | * gcc -std=gnu99 -nostdlib | ||
| 8 | * -Os -fno-asynchronous-unwind-tables -flto | ||
| 9 | * vdso_test.c parse_vdso.c -o vdso_test | ||
| 10 | * to generate a small binary with no dependencies at all. | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <sys/syscall.h> | ||
| 14 | #include <sys/time.h> | ||
| 15 | #include <unistd.h> | ||
| 16 | #include <stdint.h> | ||
| 17 | |||
| 18 | extern void *vdso_sym(const char *version, const char *name); | ||
| 19 | extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); | ||
| 20 | extern void vdso_init_from_auxv(void *auxv); | ||
| 21 | |||
| 22 | /* We need one libc function... */ | ||
| 23 | int strcmp(const char *a, const char *b) | ||
| 24 | { | ||
| 25 | /* This implementation is buggy: it never returns -1. */ | ||
| 26 | while (*a || *b) { | ||
| 27 | if (*a != *b) | ||
| 28 | return 1; | ||
| 29 | if (*a == 0 || *b == 0) | ||
| 30 | return 1; | ||
| 31 | a++; | ||
| 32 | b++; | ||
| 33 | } | ||
| 34 | |||
| 35 | return 0; | ||
| 36 | } | ||
| 37 | |||
| 38 | /* ...and two syscalls. This is x86_64-specific. */ | ||
| 39 | static inline long linux_write(int fd, const void *data, size_t len) | ||
| 40 | { | ||
| 41 | |||
| 42 | long ret; | ||
| 43 | asm volatile ("syscall" : "=a" (ret) : "a" (__NR_write), | ||
| 44 | "D" (fd), "S" (data), "d" (len) : | ||
| 45 | "cc", "memory", "rcx", | ||
| 46 | "r8", "r9", "r10", "r11" ); | ||
| 47 | return ret; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline void linux_exit(int code) | ||
| 51 | { | ||
| 52 | asm volatile ("syscall" : : "a" (__NR_exit), "D" (code)); | ||
| 53 | } | ||
| 54 | |||
| 55 | void to_base10(char *lastdig, uint64_t n) | ||
| 56 | { | ||
| 57 | while (n) { | ||
| 58 | *lastdig = (n % 10) + '0'; | ||
| 59 | n /= 10; | ||
| 60 | lastdig--; | ||
| 61 | } | ||
| 62 | } | ||
| 63 | |||
| 64 | __attribute__((externally_visible)) void c_main(void **stack) | ||
| 65 | { | ||
| 66 | /* Parse the stack */ | ||
| 67 | long argc = (long)*stack; | ||
| 68 | stack += argc + 2; | ||
| 69 | |||
| 70 | /* Now we're pointing at the environment. Skip it. */ | ||
| 71 | while(*stack) | ||
| 72 | stack++; | ||
| 73 | stack++; | ||
| 74 | |||
| 75 | /* Now we're pointing at auxv. Initialize the vDSO parser. */ | ||
| 76 | vdso_init_from_auxv((void *)stack); | ||
| 77 | |||
| 78 | /* Find gettimeofday. */ | ||
| 79 | typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); | ||
| 80 | gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); | ||
| 81 | |||
| 82 | if (!gtod) | ||
| 83 | linux_exit(1); | ||
| 84 | |||
| 85 | struct timeval tv; | ||
| 86 | long ret = gtod(&tv, 0); | ||
| 87 | |||
| 88 | if (ret == 0) { | ||
| 89 | char buf[] = "The time is                     .000000\n"; | ||
| 90 | to_base10(buf + 31, tv.tv_sec); | ||
| 91 | to_base10(buf + 38, tv.tv_usec); | ||
| 92 | linux_write(1, buf, sizeof(buf) - 1); | ||
| 93 | } else { | ||
| 94 | linux_exit(ret); | ||
| 95 | } | ||
| 96 | |||
| 97 | linux_exit(0); | ||
| 98 | } | ||
| 99 | |||
| 100 | /* | ||
| 101 | * This is the real entry point. It passes the initial stack into | ||
| 102 | * the C entry point. | ||
| 103 | */ | ||
| 104 | asm ( | ||
| 105 | ".text\n" | ||
| 106 | ".global _start\n" | ||
| 107 | ".type _start,@function\n" | ||
| 108 | "_start:\n\t" | ||
| 109 | "mov %rsp,%rdi\n\t" | ||
| 110 | "jmp c_main" | ||
| 111 | ); | ||
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt new file mode 100644 index 000000000000..7869f14d055c --- /dev/null +++ b/Documentation/x86/entry_64.txt | |||
| @@ -0,0 +1,98 @@ | |||
| 1 | This file documents some of the kernel entries in | ||
| 2 | arch/x86/kernel/entry_64.S. A lot of this explanation is adapted from | ||
| 3 | an email from Ingo Molnar: | ||
| 4 | |||
| 5 | http://lkml.kernel.org/r/<20110529191055.GC9835@elte.hu> | ||
| 6 | |||
| 7 | The x86 architecture has quite a few different ways to jump into | ||
| 8 | kernel code. Most of these entry points are registered in | ||
| 9 | arch/x86/kernel/traps.c and implemented in arch/x86/kernel/entry_64.S | ||
| 10 | and arch/x86/ia32/ia32entry.S. | ||
| 11 | |||
| 12 | The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h. | ||
| 13 | |||
| 14 | Some of these entries are: | ||
| 15 | |||
| 16 | - system_call: syscall instruction from 64-bit code. | ||
| 17 | |||
| 18 | - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall | ||
| 19 | either way. | ||
| 20 | |||
| 21 | - ia32_cstar_target, ia32_sysenter_target: syscall and sysenter from 32-bit | ||
| 22 | code | ||
| 23 | |||
| 24 | - interrupt: An array of entries. Every IDT vector that doesn't | ||
| 25 | explicitly point somewhere else gets set to the corresponding | ||
| 26 | value in interrupts. These point to a whole array of | ||
| 27 | magically-generated functions that make their way to do_IRQ with | ||
| 28 | the interrupt number as a parameter. | ||
| 29 | |||
| 30 | - emulate_vsyscall: int 0xcc, a special non-ABI entry used by | ||
| 31 | vsyscall emulation. | ||
| 32 | |||
| 33 | - APIC interrupts: Various special-purpose interrupts for things | ||
| 34 | like TLB shootdown. | ||
| 35 | |||
| 36 | - Architecturally-defined exceptions like divide_error. | ||
| 37 | |||
| 38 | There are a few complexities here. The different x86-64 entries | ||
| 39 | have different calling conventions. The syscall and sysenter | ||
| 40 | instructions have their own peculiar calling conventions. Some of | ||
| 41 | the IDT entries push an error code onto the stack; others don't. | ||
| 42 | IDT entries using the IST alternative stack mechanism need their own | ||
| 43 | magic to get the stack frames right. (You can find some | ||
| 44 | documentation in the AMD APM, Volume 2, Chapter 8 and the Intel SDM, | ||
| 45 | Volume 3, Chapter 6.) | ||
| 46 | |||
| 47 | Dealing with the swapgs instruction is especially tricky. Swapgs | ||
| 48 | toggles whether gs is the kernel gs or the user gs. The swapgs | ||
| 49 | instruction is rather fragile: it must nest perfectly, to exactly | ||
| 50 | one level of depth. It should be executed once when entering from | ||
| 51 | user mode to kernel mode, and exactly once again when returning to | ||
| 52 | user space. If we mess that up even slightly, we crash. | ||
| 53 | |||
| 54 | So when we have a secondary entry, already in kernel mode, we *must | ||
| 55 | not* use SWAPGS blindly - nor must we forget doing a SWAPGS when it's | ||
| 56 | not switched/swapped yet. | ||
| 57 | |||
| 58 | Now, there's a secondary complication: there's a cheap way to test | ||
| 59 | which mode the CPU is in and an expensive way. | ||
| 60 | |||
| 61 | The cheap way is to pick this info off the entry frame on the kernel | ||
| 62 | stack, from the CS of the ptregs area of the kernel stack: | ||
| 63 | |||
| 64 | xorl %ebx,%ebx | ||
| 65 | testl $3,CS+8(%rsp) | ||
| 66 | je error_kernelspace | ||
| 67 | SWAPGS | ||
| 68 | |||
| 69 | The expensive (paranoid) way is to read back the MSR_GS_BASE value | ||
| 70 | (which is what SWAPGS modifies): | ||
| 71 | |||
| 72 | movl $1,%ebx | ||
| 73 | movl $MSR_GS_BASE,%ecx | ||
| 74 | rdmsr | ||
| 75 | testl %edx,%edx | ||
| 76 | js 1f /* negative -> in kernel */ | ||
| 77 | SWAPGS | ||
| 78 | xorl %ebx,%ebx | ||
| 79 | 1: ret | ||
| 80 | |||
| 81 | and the whole paranoid non-paranoid macro complexity is about whether | ||
| 82 | to suffer that RDMSR cost. | ||
| 83 | |||
| 84 | If we are at an interrupt or user-trap/gate-like boundary then we can | ||
| 85 | use the faster check: the stack will be a reliable indicator of | ||
| 86 | whether SWAPGS was already done: if we see that we are a secondary | ||
| 87 | entry interrupting kernel mode execution, then we know that the GS | ||
| 88 | base has already been switched. If it says that we interrupted | ||
| 89 | user-space execution then we must do the SWAPGS. | ||
| 90 | |||
| 91 | But if we are in an NMI/MCE/DEBUG/whatever super-atomic entry context, | ||
| 92 | which might have triggered right after a normal entry wrote CS to the | ||
| 93 | stack but before we executed SWAPGS, then the only safe way to check | ||
| 94 | for GS is the slower method: the RDMSR. | ||
| 95 | |||
| 96 | So we try only to mark those entry methods 'paranoid' that absolutely | ||
| 97 | need the more expensive check for the GS base - and we generate all | ||
| 98 | 'normal' entry points with the regular (faster) entry macros. | ||
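For reference, the "cheap way" quoted above has a direct C analogue (a sketch for illustration, not kernel code; the kernel's real helper for this test is user_mode() in arch/x86/include/asm/ptrace.h):

```c
#include <asm/ptrace.h>

/*
 * Illustrative C rendition of "testl $3,CS+8(%rsp)": the bottom two
 * bits of the saved CS are the privilege level of the interrupted
 * context, so 0 means we interrupted the kernel and 3 means we
 * interrupted user space (and therefore still need SWAPGS).
 */
static inline int interrupted_user_mode(struct pt_regs *regs)
{
	return (regs->cs & 3) != 0;
}
```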
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7336ba653b8f..137b277f7e56 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig | |||
| @@ -101,6 +101,9 @@ config GENERIC_IOMAP | |||
| 101 | bool | 101 | bool |
| 102 | default y | 102 | default y |
| 103 | 103 | ||
| 104 | config ARCH_CLOCKSOURCE_DATA | ||
| 105 | def_bool y | ||
| 106 | |||
| 104 | config SCHED_OMIT_FRAME_POINTER | 107 | config SCHED_OMIT_FRAME_POINTER |
| 105 | bool | 108 | bool |
| 106 | default y | 109 | default y |
diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h new file mode 100644 index 000000000000..5c8596e4cb02 --- /dev/null +++ b/arch/ia64/include/asm/clocksource.h | |||
| @@ -0,0 +1,10 @@ | |||
| 1 | /* IA64-specific clocksource additions */ | ||
| 2 | |||
| 3 | #ifndef _ASM_IA64_CLOCKSOURCE_H | ||
| 4 | #define _ASM_IA64_CLOCKSOURCE_H | ||
| 5 | |||
| 6 | struct arch_clocksource_data { | ||
| 7 | void *fsys_mmio; /* used by fsyscall asm code */ | ||
| 8 | }; | ||
| 9 | |||
| 10 | #endif /* _ASM_IA64_CLOCKSOURCE_H */ | ||
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index f64097b5118a..4826ff957a3d 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c | |||
| @@ -115,7 +115,7 @@ int __init init_cyclone_clock(void) | |||
| 115 | } | 115 | } |
| 116 | /* initialize last tick */ | 116 | /* initialize last tick */ |
| 117 | cyclone_mc = cyclone_timer; | 117 | cyclone_mc = cyclone_timer; |
| 118 | clocksource_cyclone.fsys_mmio = cyclone_timer; | 118 | clocksource_cyclone.archdata.fsys_mmio = cyclone_timer; |
| 119 | clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); | 119 | clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); |
| 120 | 120 | ||
| 121 | return 0; | 121 | return 0; |
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 85118dfe9bb5..43920de425f1 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c | |||
| @@ -468,7 +468,7 @@ void update_vsyscall(struct timespec *wall, struct timespec *wtm, | |||
| 468 | fsyscall_gtod_data.clk_mask = c->mask; | 468 | fsyscall_gtod_data.clk_mask = c->mask; |
| 469 | fsyscall_gtod_data.clk_mult = mult; | 469 | fsyscall_gtod_data.clk_mult = mult; |
| 470 | fsyscall_gtod_data.clk_shift = c->shift; | 470 | fsyscall_gtod_data.clk_shift = c->shift; |
| 471 | fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio; | 471 | fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio; |
| 472 | fsyscall_gtod_data.clk_cycle_last = c->cycle_last; | 472 | fsyscall_gtod_data.clk_cycle_last = c->cycle_last; |
| 473 | 473 | ||
| 474 | /* copy kernel time structures */ | 474 | /* copy kernel time structures */ |
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c index c34efda122e1..0f8844e49363 100644 --- a/arch/ia64/sn/kernel/sn2/timer.c +++ b/arch/ia64/sn/kernel/sn2/timer.c | |||
| @@ -54,7 +54,7 @@ ia64_sn_udelay (unsigned long usecs) | |||
| 54 | 54 | ||
| 55 | void __init sn_timer_init(void) | 55 | void __init sn_timer_init(void) |
| 56 | { | 56 | { |
| 57 | clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR; | 57 | clocksource_sn2.archdata.fsys_mmio = RTC_COUNTER_ADDR; |
| 58 | clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); | 58 | clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); |
| 59 | 59 | ||
| 60 | ia64_udelay = &ia64_sn_udelay; | 60 | ia64_udelay = &ia64_sn_udelay; |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc76e4209003..5f60ea190d5b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -95,6 +95,10 @@ config CLOCKSOURCE_WATCHDOG | |||
| 95 | config GENERIC_CLOCKEVENTS | 95 | config GENERIC_CLOCKEVENTS |
| 96 | def_bool y | 96 | def_bool y |
| 97 | 97 | ||
| 98 | config ARCH_CLOCKSOURCE_DATA | ||
| 99 | def_bool y | ||
| 100 | depends on X86_64 | ||
| 101 | |||
| 98 | config GENERIC_CLOCKEVENTS_BROADCAST | 102 | config GENERIC_CLOCKEVENTS_BROADCAST |
| 99 | def_bool y | 103 | def_bool y |
| 100 | depends on X86_64 || (X86_32 && X86_LOCAL_APIC) | 104 | depends on X86_64 || (X86_32 && X86_LOCAL_APIC) |
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 94d420b360d1..4554cc6fb96a 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h | |||
| @@ -17,8 +17,8 @@ | |||
| 17 | 17 | ||
| 18 | .macro altinstruction_entry orig alt feature orig_len alt_len | 18 | .macro altinstruction_entry orig alt feature orig_len alt_len |
| 19 | .align 8 | 19 | .align 8 |
| 20 | .quad \orig | 20 | .long \orig - . |
| 21 | .quad \alt | 21 | .long \alt - . |
| 22 | .word \feature | 22 | .word \feature |
| 23 | .byte \orig_len | 23 | .byte \orig_len |
| 24 | .byte \alt_len | 24 | .byte \alt_len |
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bf535f947e8c..23fb6d79f209 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
| @@ -43,8 +43,8 @@ | |||
| 43 | #endif | 43 | #endif |
| 44 | 44 | ||
| 45 | struct alt_instr { | 45 | struct alt_instr { |
| 46 | u8 *instr; /* original instruction */ | 46 | s32 instr_offset; /* original instruction */ |
| 47 | u8 *replacement; | 47 | s32 repl_offset; /* offset to replacement instruction */ |
| 48 | u16 cpuid; /* cpuid bit set for replacement */ | 48 | u16 cpuid; /* cpuid bit set for replacement */ |
| 49 | u8 instrlen; /* length of original instruction */ | 49 | u8 instrlen; /* length of original instruction */ |
| 50 | u8 replacementlen; /* length of new instruction, <= instrlen */ | 50 | u8 replacementlen; /* length of new instruction, <= instrlen */ |
| @@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
| 84 | "661:\n\t" oldinstr "\n662:\n" \ | 84 | "661:\n\t" oldinstr "\n662:\n" \ |
| 85 | ".section .altinstructions,\"a\"\n" \ | 85 | ".section .altinstructions,\"a\"\n" \ |
| 86 | _ASM_ALIGN "\n" \ | 86 | _ASM_ALIGN "\n" \ |
| 87 | _ASM_PTR "661b\n" /* label */ \ | 87 | " .long 661b - .\n" /* label */ \ |
| 88 | _ASM_PTR "663f\n" /* new instruction */ \ | 88 | " .long 663f - .\n" /* new instruction */ \ |
| 89 | " .word " __stringify(feature) "\n" /* feature bit */ \ | 89 | " .word " __stringify(feature) "\n" /* feature bit */ \ |
| 90 | " .byte 662b-661b\n" /* sourcelen */ \ | 90 | " .byte 662b-661b\n" /* sourcelen */ \ |
| 91 | " .byte 664f-663f\n" /* replacementlen */ \ | 91 | " .byte 664f-663f\n" /* replacementlen */ \ |
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h new file mode 100644 index 000000000000..0bdbbb3b9ce7 --- /dev/null +++ b/arch/x86/include/asm/clocksource.h | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | /* x86-specific clocksource additions */ | ||
| 2 | |||
| 3 | #ifndef _ASM_X86_CLOCKSOURCE_H | ||
| 4 | #define _ASM_X86_CLOCKSOURCE_H | ||
| 5 | |||
| 6 | #ifdef CONFIG_X86_64 | ||
| 7 | |||
| 8 | #define VCLOCK_NONE 0 /* No vDSO clock available. */ | ||
| 9 | #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ | ||
| 10 | #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ | ||
| 11 | |||
| 12 | struct arch_clocksource_data { | ||
| 13 | int vclock_mode; | ||
| 14 | }; | ||
| 15 | |||
| 16 | #endif /* CONFIG_X86_64 */ | ||
| 17 | |||
| 18 | #endif /* _ASM_X86_CLOCKSOURCE_H */ | ||
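With the `.vread` function pointer gone from struct clocksource (a kernel function pointer is useless and unsafe to call from user space), the vDSO instead switches on this small integer. A rough sketch of the consumer side, simplified and with illustrative names rather than the literal kernel code (after this series the real logic lives in arch/x86/vdso/vclock_gettime.c):

```c
/*
 * Rough sketch of how the vDSO consumes vclock_mode. vread_tsc() and
 * vread_hpet() now live in the vDSO itself per this series; names and
 * structure here are illustrative.
 */
static notrace cycle_t vgetcycles(void)
{
	switch (VVAR(vsyscall_gtod_data).clock.vclock_mode) {
	case VCLOCK_TSC:
		return vread_tsc();	/* rdtsc with an ordering barrier */
	case VCLOCK_HPET:
		return vread_hpet();	/* MMIO read of the HPET counter */
	default:
		return 0;		/* VCLOCK_NONE: fall back to a real syscall */
	}
}
```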
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 71cc3800712c..9929b35929ff 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
| @@ -331,8 +331,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
| 331 | "2:\n" | 331 | "2:\n" |
| 332 | ".section .altinstructions,\"a\"\n" | 332 | ".section .altinstructions,\"a\"\n" |
| 333 | _ASM_ALIGN "\n" | 333 | _ASM_ALIGN "\n" |
| 334 | _ASM_PTR "1b\n" | 334 | " .long 1b - .\n" |
| 335 | _ASM_PTR "0\n" /* no replacement */ | 335 | " .long 0\n" /* no replacement */ |
| 336 | " .word %P0\n" /* feature bit */ | 336 | " .word %P0\n" /* feature bit */ |
| 337 | " .byte 2b - 1b\n" /* source len */ | 337 | " .byte 2b - 1b\n" /* source len */ |
| 338 | " .byte 0\n" /* replacement len */ | 338 | " .byte 0\n" /* replacement len */ |
| @@ -349,8 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
| 349 | "2:\n" | 349 | "2:\n" |
| 350 | ".section .altinstructions,\"a\"\n" | 350 | ".section .altinstructions,\"a\"\n" |
| 351 | _ASM_ALIGN "\n" | 351 | _ASM_ALIGN "\n" |
| 352 | _ASM_PTR "1b\n" | 352 | " .long 1b - .\n" |
| 353 | _ASM_PTR "3f\n" | 353 | " .long 3f - .\n" |
| 354 | " .word %P1\n" /* feature bit */ | 354 | " .word %P1\n" /* feature bit */ |
| 355 | " .byte 2b - 1b\n" /* source len */ | 355 | " .byte 2b - 1b\n" /* source len */ |
| 356 | " .byte 4f - 3f\n" /* replacement len */ | 356 | " .byte 4f - 3f\n" /* replacement len */ |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4729b2b63117..460c74e4852c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
| @@ -78,6 +78,7 @@ enum fixed_addresses { | |||
| 78 | VSYSCALL_LAST_PAGE, | 78 | VSYSCALL_LAST_PAGE, |
| 79 | VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE | 79 | VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE |
| 80 | + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, | 80 | + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, |
| 81 | VVAR_PAGE, | ||
| 81 | VSYSCALL_HPET, | 82 | VSYSCALL_HPET, |
| 82 | #endif | 83 | #endif |
| 83 | FIX_DBGP_BASE, | 84 | FIX_DBGP_BASE, |
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6665026ea3ea..f9a320984a10 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h | |||
| @@ -17,7 +17,8 @@ | |||
| 17 | * Vectors 0 ... 31 : system traps and exceptions - hardcoded events | 17 | * Vectors 0 ... 31 : system traps and exceptions - hardcoded events |
| 18 | * Vectors 32 ... 127 : device interrupts | 18 | * Vectors 32 ... 127 : device interrupts |
| 19 | * Vector 128 : legacy int80 syscall interface | 19 | * Vector 128 : legacy int80 syscall interface |
| 20 | * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts | 20 | * Vector 204 : legacy x86_64 vsyscall emulation |
| 21 | * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts | ||
| 21 | * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts | 22 | * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts |
| 22 | * | 23 | * |
| 23 | * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. | 24 | * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. |
| @@ -50,6 +51,9 @@ | |||
| 50 | #ifdef CONFIG_X86_32 | 51 | #ifdef CONFIG_X86_32 |
| 51 | # define SYSCALL_VECTOR 0x80 | 52 | # define SYSCALL_VECTOR 0x80 |
| 52 | #endif | 53 | #endif |
| 54 | #ifdef CONFIG_X86_64 | ||
| 55 | # define VSYSCALL_EMU_VECTOR 0xcc | ||
| 56 | #endif | ||
| 53 | 57 | ||
| 54 | /* | 58 | /* |
| 55 | * Vectors 0x30-0x3f are used for ISA interrupts. | 59 | * Vectors 0x30-0x3f are used for ISA interrupts. |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d56187c6b838..013286a10c2c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
| @@ -107,7 +107,8 @@ | |||
| 107 | #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) | 107 | #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) |
| 108 | #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) | 108 | #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) |
| 109 | #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) | 109 | #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) |
| 110 | #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) | 110 | #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) |
| 111 | #define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) | ||
| 111 | #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) | 112 | #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) |
| 112 | #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) | 113 | #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) |
| 113 | #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) | 114 | #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) |
| @@ -129,7 +130,8 @@ | |||
| 129 | #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) | 130 | #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) |
| 130 | #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) | 131 | #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) |
| 131 | #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) | 132 | #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) |
| 132 | #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) | 133 | #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) |
| 134 | #define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) | ||
| 133 | 135 | ||
| 134 | #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) | 136 | #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) |
| 135 | #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) | 137 | #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) |
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0310da67307f..2bae0a513b40 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | #ifndef _ASM_X86_TRAPS_H | 1 | #ifndef _ASM_X86_TRAPS_H |
| 2 | #define _ASM_X86_TRAPS_H | 2 | #define _ASM_X86_TRAPS_H |
| 3 | 3 | ||
| 4 | #include <linux/kprobes.h> | ||
| 5 | |||
| 4 | #include <asm/debugreg.h> | 6 | #include <asm/debugreg.h> |
| 5 | #include <asm/siginfo.h> /* TRAP_TRACE, ... */ | 7 | #include <asm/siginfo.h> /* TRAP_TRACE, ... */ |
| 6 | 8 | ||
| @@ -38,6 +40,7 @@ asmlinkage void alignment_check(void); | |||
| 38 | asmlinkage void machine_check(void); | 40 | asmlinkage void machine_check(void); |
| 39 | #endif /* CONFIG_X86_MCE */ | 41 | #endif /* CONFIG_X86_MCE */ |
| 40 | asmlinkage void simd_coprocessor_error(void); | 42 | asmlinkage void simd_coprocessor_error(void); |
| 43 | asmlinkage void emulate_vsyscall(void); | ||
| 41 | 44 | ||
| 42 | dotraplinkage void do_divide_error(struct pt_regs *, long); | 45 | dotraplinkage void do_divide_error(struct pt_regs *, long); |
| 43 | dotraplinkage void do_debug(struct pt_regs *, long); | 46 | dotraplinkage void do_debug(struct pt_regs *, long); |
| @@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); | |||
| 64 | dotraplinkage void do_machine_check(struct pt_regs *, long); | 67 | dotraplinkage void do_machine_check(struct pt_regs *, long); |
| 65 | #endif | 68 | #endif |
| 66 | dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); | 69 | dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); |
| 70 | dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); | ||
| 67 | #ifdef CONFIG_X86_32 | 71 | #ifdef CONFIG_X86_32 |
| 68 | dotraplinkage void do_iret_error(struct pt_regs *, long); | 72 | dotraplinkage void do_iret_error(struct pt_regs *, long); |
| 69 | #endif | 73 | #endif |
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9db5583b6d38..83e2efd181e2 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h | |||
| @@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void); | |||
| 51 | extern int check_tsc_unstable(void); | 51 | extern int check_tsc_unstable(void); |
| 52 | extern unsigned long native_calibrate_tsc(void); | 52 | extern unsigned long native_calibrate_tsc(void); |
| 53 | 53 | ||
| 54 | #ifdef CONFIG_X86_64 | ||
| 55 | extern cycles_t vread_tsc(void); | ||
| 56 | #endif | ||
| 57 | |||
| 58 | /* | 54 | /* |
| 59 | * Boot-time check whether the TSCs are synchronized across | 55 | * Boot-time check whether the TSCs are synchronized across |
| 60 | * all CPUs/cores: | 56 | * all CPUs/cores: |
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 646b4c1ca695..815285bcaceb 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h | |||
| @@ -11,10 +11,9 @@ struct vsyscall_gtod_data { | |||
| 11 | time_t wall_time_sec; | 11 | time_t wall_time_sec; |
| 12 | u32 wall_time_nsec; | 12 | u32 wall_time_nsec; |
| 13 | 13 | ||
| 14 | int sysctl_enabled; | ||
| 15 | struct timezone sys_tz; | 14 | struct timezone sys_tz; |
| 16 | struct { /* extract of a clocksource struct */ | 15 | struct { /* extract of a clocksource struct */ |
| 17 | cycle_t (*vread)(void); | 16 | int vclock_mode; |
| 18 | cycle_t cycle_last; | 17 | cycle_t cycle_last; |
| 19 | cycle_t mask; | 18 | cycle_t mask; |
| 20 | u32 mult; | 19 | u32 mult; |
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d55597351f6a..60107072c28b 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h | |||
| @@ -16,10 +16,6 @@ enum vsyscall_num { | |||
| 16 | #ifdef __KERNEL__ | 16 | #ifdef __KERNEL__ |
| 17 | #include <linux/seqlock.h> | 17 | #include <linux/seqlock.h> |
| 18 | 18 | ||
| 19 | /* Definitions for CONFIG_GENERIC_TIME definitions */ | ||
| 20 | #define __vsyscall_fn \ | ||
| 21 | __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace | ||
| 22 | |||
| 23 | #define VGETCPU_RDTSCP 1 | 19 | #define VGETCPU_RDTSCP 1 |
| 24 | #define VGETCPU_LSL 2 | 20 | #define VGETCPU_LSL 2 |
| 25 | 21 | ||
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 341b3559452b..de656ac2af41 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h | |||
| @@ -10,15 +10,14 @@ | |||
| 10 | * In normal kernel code, they are used like any other variable. | 10 | * In normal kernel code, they are used like any other variable. |
| 11 | * In user code, they are accessed through the VVAR macro. | 11 | * In user code, they are accessed through the VVAR macro. |
| 12 | * | 12 | * |
| 13 | * Each of these variables lives in the vsyscall page, and each | 13 | * These variables live in a page of kernel data that has an extra RO |
| 14 | * one needs a unique offset within the little piece of the page | 14 | * mapping for userspace. Each variable needs a unique offset within |
| 15 | * reserved for vvars. Specify that offset in DECLARE_VVAR. | 15 | * that page; specify that offset with the DECLARE_VVAR macro. (If |
| 16 | * (There are 896 bytes available. If you mess up, the linker will | 16 | * you mess up, the linker will catch it.) |
| 17 | * catch it.) | ||
| 18 | */ | 17 | */ |
| 19 | 18 | ||
| 20 | /* Offset of vars within vsyscall page */ | 19 | /* Base address of vvars. This is not ABI. */ |
| 21 | #define VSYSCALL_VARS_OFFSET (3072 + 128) | 20 | #define VVAR_ADDRESS (-10*1024*1024 - 4096) |
| 22 | 21 | ||
| 23 | #if defined(__VVAR_KERNEL_LDS) | 22 | #if defined(__VVAR_KERNEL_LDS) |
| 24 | 23 | ||
| @@ -26,17 +25,17 @@ | |||
| 26 | * right place. | 25 | * right place. |
| 27 | */ | 26 | */ |
| 28 | #define DECLARE_VVAR(offset, type, name) \ | 27 | #define DECLARE_VVAR(offset, type, name) \ |
| 29 | EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) | 28 | EMIT_VVAR(name, offset) |
| 30 | 29 | ||
| 31 | #else | 30 | #else |
| 32 | 31 | ||
| 33 | #define DECLARE_VVAR(offset, type, name) \ | 32 | #define DECLARE_VVAR(offset, type, name) \ |
| 34 | static type const * const vvaraddr_ ## name = \ | 33 | static type const * const vvaraddr_ ## name = \ |
| 35 | (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); | 34 | (void *)(VVAR_ADDRESS + (offset)); |
| 36 | 35 | ||
| 37 | #define DEFINE_VVAR(type, name) \ | 36 | #define DEFINE_VVAR(type, name) \ |
| 38 | type __vvar_ ## name \ | 37 | type name \ |
| 39 | __attribute__((section(".vsyscall_var_" #name), aligned(16))) | 38 | __attribute__((section(".vvar_" #name), aligned(16))) |
| 40 | 39 | ||
| 41 | #define VVAR(name) (*vvaraddr_ ## name) | 40 | #define VVAR(name) (*vvaraddr_ ## name) |
| 42 | 41 | ||
| @@ -45,8 +44,7 @@ | |||
| 45 | /* DECLARE_VVAR(offset, type, name) */ | 44 | /* DECLARE_VVAR(offset, type, name) */ |
| 46 | 45 | ||
| 47 | DECLARE_VVAR(0, volatile unsigned long, jiffies) | 46 | DECLARE_VVAR(0, volatile unsigned long, jiffies) |
| 48 | DECLARE_VVAR(8, int, vgetcpu_mode) | 47 | DECLARE_VVAR(16, int, vgetcpu_mode) |
| 49 | DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) | 48 | DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) |
| 50 | 49 | ||
| 51 | #undef DECLARE_VVAR | 50 | #undef DECLARE_VVAR |
| 52 | #undef VSYSCALL_VARS_OFFSET | ||
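In use, the macro pair splits a vvar's life in two: kernel code defines the variable, the linker script pins it at its declared offset in the vvar page, and vDSO/vsyscall code reads it back through the read-only user mapping. A short sketch using vgetcpu_mode, which the table above places at offset 16:

```c
/*
 * Kernel side: the ".vvar_..." section name lets the linker script
 * place the variable at its declared offset in the vvar page.
 */
DEFINE_VVAR(int, vgetcpu_mode);

/* User-visible side: read through the fixed, read-only VVAR mapping. */
static inline int current_vgetcpu_mode(void)
{
	return VVAR(vgetcpu_mode);
}
```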
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 11817ff85399..04105574c8e9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
| @@ -24,17 +24,12 @@ endif | |||
| 24 | nostackp := $(call cc-option, -fno-stack-protector) | 24 | nostackp := $(call cc-option, -fno-stack-protector) |
| 25 | CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) | 25 | CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) |
| 26 | CFLAGS_hpet.o := $(nostackp) | 26 | CFLAGS_hpet.o := $(nostackp) |
| 27 | CFLAGS_vread_tsc_64.o := $(nostackp) | ||
| 28 | CFLAGS_paravirt.o := $(nostackp) | 27 | CFLAGS_paravirt.o := $(nostackp) |
| 29 | GCOV_PROFILE_vsyscall_64.o := n | 28 | GCOV_PROFILE_vsyscall_64.o := n |
| 30 | GCOV_PROFILE_hpet.o := n | 29 | GCOV_PROFILE_hpet.o := n |
| 31 | GCOV_PROFILE_tsc.o := n | 30 | GCOV_PROFILE_tsc.o := n |
| 32 | GCOV_PROFILE_vread_tsc_64.o := n | ||
| 33 | GCOV_PROFILE_paravirt.o := n | 31 | GCOV_PROFILE_paravirt.o := n |
| 34 | 32 | ||
| 35 | # vread_tsc_64 is hot and should be fully optimized: | ||
| 36 | CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls | ||
| 37 | |||
| 38 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 33 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
| 39 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 34 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
| 40 | obj-y += time.o ioport.o ldt.o dumpstack.o | 35 | obj-y += time.o ioport.o ldt.o dumpstack.o |
| @@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o | |||
| 43 | obj-y += probe_roms.o | 38 | obj-y += probe_roms.o |
| 44 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o | 39 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o |
| 45 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 40 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
| 46 | obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o | 41 | obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o |
| 42 | obj-$(CONFIG_X86_64) += vsyscall_emu_64.o | ||
| 47 | obj-y += bootflag.o e820.o | 43 | obj-y += bootflag.o e820.o |
| 48 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o | 44 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o |
| 49 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o | 45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a81f2d52f869..c63822816249 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
| @@ -14,7 +14,6 @@ | |||
| 14 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
| 15 | #include <asm/mce.h> | 15 | #include <asm/mce.h> |
| 16 | #include <asm/nmi.h> | 16 | #include <asm/nmi.h> |
| 17 | #include <asm/vsyscall.h> | ||
| 18 | #include <asm/cacheflush.h> | 17 | #include <asm/cacheflush.h> |
| 19 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
| 20 | #include <asm/io.h> | 19 | #include <asm/io.h> |
| @@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len) | |||
| 250 | 249 | ||
| 251 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | 250 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
| 252 | extern s32 __smp_locks[], __smp_locks_end[]; | 251 | extern s32 __smp_locks[], __smp_locks_end[]; |
| 253 | extern char __vsyscall_0; | ||
| 254 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 252 | void *text_poke_early(void *addr, const void *opcode, size_t len); |
| 255 | 253 | ||
| 256 | /* Replace instructions with better alternatives for this CPU type. | 254 | /* Replace instructions with better alternatives for this CPU type. |
| @@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
| 263 | struct alt_instr *end) | 261 | struct alt_instr *end) |
| 264 | { | 262 | { |
| 265 | struct alt_instr *a; | 263 | struct alt_instr *a; |
| 264 | u8 *instr, *replacement; | ||
| 266 | u8 insnbuf[MAX_PATCH_LEN]; | 265 | u8 insnbuf[MAX_PATCH_LEN]; |
| 267 | 266 | ||
| 268 | DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); | 267 | DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); |
| @@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
| 276 | * order. | 275 | * order. |
| 277 | */ | 276 | */ |
| 278 | for (a = start; a < end; a++) { | 277 | for (a = start; a < end; a++) { |
| 279 | u8 *instr = a->instr; | 278 | instr = (u8 *)&a->instr_offset + a->instr_offset; |
| 279 | replacement = (u8 *)&a->repl_offset + a->repl_offset; | ||
| 280 | BUG_ON(a->replacementlen > a->instrlen); | 280 | BUG_ON(a->replacementlen > a->instrlen); |
| 281 | BUG_ON(a->instrlen > sizeof(insnbuf)); | 281 | BUG_ON(a->instrlen > sizeof(insnbuf)); |
| 282 | BUG_ON(a->cpuid >= NCAPINTS*32); | 282 | BUG_ON(a->cpuid >= NCAPINTS*32); |
| 283 | if (!boot_cpu_has(a->cpuid)) | 283 | if (!boot_cpu_has(a->cpuid)) |
| 284 | continue; | 284 | continue; |
| 285 | #ifdef CONFIG_X86_64 | 285 | |
| 286 | /* vsyscall code is not mapped yet. resolve it manually. */ | 286 | memcpy(insnbuf, replacement, a->replacementlen); |
| 287 | if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { | 287 | |
| 288 | instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); | 288 | /* 0xe8 is a relative jump; fix the offset. */ |
| 289 | DPRINTK("%s: vsyscall fixup: %p => %p\n", | ||
| 290 | __func__, a->instr, instr); | ||
| 291 | } | ||
| 292 | #endif | ||
| 293 | memcpy(insnbuf, a->replacement, a->replacementlen); | ||
| 294 | if (*insnbuf == 0xe8 && a->replacementlen == 5) | 289 | if (*insnbuf == 0xe8 && a->replacementlen == 5) |
| 295 | *(s32 *)(insnbuf + 1) += a->replacement - a->instr; | 290 | *(s32 *)(insnbuf + 1) += replacement - instr; |
| 291 | |||
| 296 | add_nops(insnbuf + a->replacementlen, | 292 | add_nops(insnbuf + a->replacementlen, |
| 297 | a->instrlen - a->replacementlen); | 293 | a->instrlen - a->replacementlen); |
| 294 | |||
| 298 | text_poke_early(instr, insnbuf, a->instrlen); | 295 | text_poke_early(instr, insnbuf, a->instrlen); |
| 299 | } | 296 | } |
| 300 | } | 297 | } |
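The 0xe8 fixup above is just re-basing a relative displacement: a 5-byte call encoded at replacement address R targeting T stores rel32 = T - (R + 5), so after the bytes are copied to the original site I, adding (R - I) keeps the call aimed at T, since (T - (R + 5)) + (R - I) = T - (I + 5). A sketch of that step in isolation:

```c
/*
 * Sketch of the "0xe8 is a relative jump; fix the offset" step:
 * rel32 in a call instruction is relative to the end of the
 * instruction, so moving the instruction from 'replacement' to
 * 'instr' requires adding the distance between the two sites.
 */
static inline s32 rebase_call_rel32(s32 rel32, u8 *replacement, u8 *instr)
{
	return rel32 + (s32)(replacement - instr);
}
```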
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 37e895a1c74d..e13329d800c8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | /* | 9 | /* |
| 10 | * entry.S contains the system-call and fault low-level handling routines. | 10 | * entry.S contains the system-call and fault low-level handling routines. |
| 11 | * | 11 | * |
| 12 | * Some of this is documented in Documentation/x86/entry_64.txt | ||
| 13 | * | ||
| 12 | * NOTE: This code handles signal-recognition, which happens every time | 14 | * NOTE: This code handles signal-recognition, which happens every time |
| 13 | * after an interrupt and after each system call. | 15 | * after an interrupt and after each system call. |
| 14 | * | 16 | * |
| @@ -1109,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug | |||
| 1109 | zeroentry coprocessor_error do_coprocessor_error | 1111 | zeroentry coprocessor_error do_coprocessor_error |
| 1110 | errorentry alignment_check do_alignment_check | 1112 | errorentry alignment_check do_alignment_check |
| 1111 | zeroentry simd_coprocessor_error do_simd_coprocessor_error | 1113 | zeroentry simd_coprocessor_error do_simd_coprocessor_error |
| 1114 | zeroentry emulate_vsyscall do_emulate_vsyscall | ||
| 1115 | |||
| 1112 | 1116 | ||
| 1113 | /* Reload gs selector with exception handling */ | 1117 | /* Reload gs selector with exception handling */ |
| 1114 | /* edi: new selector */ | 1118 | /* edi: new selector */ |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0f4b0651cd3f..4aecc54236a9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
| @@ -72,7 +72,7 @@ static inline void hpet_set_mapping(void) | |||
| 72 | { | 72 | { |
| 73 | hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); | 73 | hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); |
| 74 | #ifdef CONFIG_X86_64 | 74 | #ifdef CONFIG_X86_64 |
| 75 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | 75 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); |
| 76 | #endif | 76 | #endif |
| 77 | } | 77 | } |
| 78 | 78 | ||
| @@ -739,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs) | |||
| 739 | return (cycle_t)hpet_readl(HPET_COUNTER); | 739 | return (cycle_t)hpet_readl(HPET_COUNTER); |
| 740 | } | 740 | } |
| 741 | 741 | ||
| 742 | #ifdef CONFIG_X86_64 | ||
| 743 | static cycle_t __vsyscall_fn vread_hpet(void) | ||
| 744 | { | ||
| 745 | return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); | ||
| 746 | } | ||
| 747 | #endif | ||
| 748 | |||
| 749 | static struct clocksource clocksource_hpet = { | 742 | static struct clocksource clocksource_hpet = { |
| 750 | .name = "hpet", | 743 | .name = "hpet", |
| 751 | .rating = 250, | 744 | .rating = 250, |
| @@ -754,7 +747,7 @@ static struct clocksource clocksource_hpet = { | |||
| 754 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 747 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
| 755 | .resume = hpet_resume_counter, | 748 | .resume = hpet_resume_counter, |
| 756 | #ifdef CONFIG_X86_64 | 749 | #ifdef CONFIG_X86_64 |
| 757 | .vread = vread_hpet, | 750 | .archdata = { .vclock_mode = VCLOCK_HPET }, |
| 758 | #endif | 751 | #endif |
| 759 | }; | 752 | }; |
| 760 | 753 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9de..fbc097a085ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
| @@ -872,6 +872,12 @@ void __init trap_init(void) | |||
| 872 | set_bit(SYSCALL_VECTOR, used_vectors); | 872 | set_bit(SYSCALL_VECTOR, used_vectors); |
| 873 | #endif | 873 | #endif |
| 874 | 874 | ||
| 875 | #ifdef CONFIG_X86_64 | ||
| 876 | BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); | ||
| 877 | set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); | ||
| 878 | set_bit(VSYSCALL_EMU_VECTOR, used_vectors); | ||
| 879 | #endif | ||
| 880 | |||
| 875 | /* | 881 | /* |
| 876 | * Should be a barrier for any external CPU state: | 882 | * Should be a barrier for any external CPU state: |
| 877 | */ | 883 | */ |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6cc6922262af..56c633a5db72 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
| @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { | |||
| 777 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | 777 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | |
| 778 | CLOCK_SOURCE_MUST_VERIFY, | 778 | CLOCK_SOURCE_MUST_VERIFY, |
| 779 | #ifdef CONFIG_X86_64 | 779 | #ifdef CONFIG_X86_64 |
| 780 | .vread = vread_tsc, | 780 | .archdata = { .vclock_mode = VCLOCK_TSC }, |
| 781 | #endif | 781 | #endif |
| 782 | }; | 782 | }; |
| 783 | 783 | ||
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 89aed99aafce..4aa9c54a9b76 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
| @@ -161,50 +161,47 @@ SECTIONS | |||
| 161 | 161 | ||
| 162 | #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) | 162 | #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) |
| 163 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | 163 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) |
| 164 | #define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ | ||
| 165 | ADDR(.vsyscall_0) + offset \ | ||
| 166 | : AT(VLOAD(.vsyscall_var_ ## x)) { \ | ||
| 167 | *(.vsyscall_var_ ## x) \ | ||
| 168 | } \ | ||
| 169 | x = VVIRT(.vsyscall_var_ ## x); | ||
| 170 | 164 | ||
| 171 | . = ALIGN(4096); | 165 | . = ALIGN(4096); |
| 172 | __vsyscall_0 = .; | 166 | __vsyscall_0 = .; |
| 173 | 167 | ||
| 174 | . = VSYSCALL_ADDR; | 168 | . = VSYSCALL_ADDR; |
| 175 | .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { | 169 | .vsyscall : AT(VLOAD(.vsyscall)) { |
| 176 | *(.vsyscall_0) | 170 | *(.vsyscall_0) |
| 177 | } :user | ||
| 178 | 171 | ||
| 179 | . = ALIGN(L1_CACHE_BYTES); | 172 | . = 1024; |
| 180 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { | ||
| 181 | *(.vsyscall_fn) | ||
| 182 | } | ||
| 183 | |||
| 184 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { | ||
| 185 | *(.vsyscall_1) | 173 | *(.vsyscall_1) |
| 186 | } | ||
| 187 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { | ||
| 188 | *(.vsyscall_2) | ||
| 189 | } | ||
| 190 | 174 | ||
| 191 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { | 175 | . = 2048; |
| 192 | *(.vsyscall_3) | 176 | *(.vsyscall_2) |
| 193 | } | ||
| 194 | |||
| 195 | #define __VVAR_KERNEL_LDS | ||
| 196 | #include <asm/vvar.h> | ||
| 197 | #undef __VVAR_KERNEL_LDS | ||
| 198 | 177 | ||
| 199 | . = __vsyscall_0 + PAGE_SIZE; | 178 | . = 4096; /* Pad the whole page. */ |
| 179 | } :user =0xcc | ||
| 180 | . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); | ||
| 200 | 181 | ||
| 201 | #undef VSYSCALL_ADDR | 182 | #undef VSYSCALL_ADDR |
| 202 | #undef VLOAD_OFFSET | 183 | #undef VLOAD_OFFSET |
| 203 | #undef VLOAD | 184 | #undef VLOAD |
| 204 | #undef VVIRT_OFFSET | 185 | #undef VVIRT_OFFSET |
| 205 | #undef VVIRT | 186 | #undef VVIRT |
| 187 | |||
| 188 | __vvar_page = .; | ||
| 189 | |||
| 190 | .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { | ||
| 191 | |||
| 192 | /* Place all vvars at the offsets in asm/vvar.h. */ | ||
| 193 | #define EMIT_VVAR(name, offset) \ | ||
| 194 | . = offset; \ | ||
| 195 | *(.vvar_ ## name) | ||
| 196 | #define __VVAR_KERNEL_LDS | ||
| 197 | #include <asm/vvar.h> | ||
| 198 | #undef __VVAR_KERNEL_LDS | ||
| 206 | #undef EMIT_VVAR | 199 | #undef EMIT_VVAR |
| 207 | 200 | ||
| 201 | } :data | ||
| 202 | |||
| 203 | . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); | ||
| 204 | |||
| 208 | #endif /* CONFIG_X86_64 */ | 205 | #endif /* CONFIG_X86_64 */ |
| 209 | 206 | ||
| 210 | /* Init code and data - will be freed after init */ | 207 | /* Init code and data - will be freed after init */ |
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c deleted file mode 100644 index a81aa9e9894c..000000000000 --- a/arch/x86/kernel/vread_tsc_64.c +++ /dev/null | |||
| @@ -1,36 +0,0 @@ | |||
| 1 | /* This code runs in userspace. */ | ||
| 2 | |||
| 3 | #define DISABLE_BRANCH_PROFILING | ||
| 4 | #include <asm/vgtod.h> | ||
| 5 | |||
| 6 | notrace cycle_t __vsyscall_fn vread_tsc(void) | ||
| 7 | { | ||
| 8 | cycle_t ret; | ||
| 9 | u64 last; | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Empirically, a fence (of type that depends on the CPU) | ||
| 13 | * before rdtsc is enough to ensure that rdtsc is ordered | ||
| 14 | * with respect to loads. The various CPU manuals are unclear | ||
| 15 | * as to whether rdtsc can be reordered with later loads, | ||
| 16 | * but no one has ever seen it happen. | ||
| 17 | */ | ||
| 18 | rdtsc_barrier(); | ||
| 19 | ret = (cycle_t)vget_cycles(); | ||
| 20 | |||
| 21 | last = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
| 22 | |||
| 23 | if (likely(ret >= last)) | ||
| 24 | return ret; | ||
| 25 | |||
| 26 | /* | ||
| 27 | * GCC likes to generate cmov here, but this branch is extremely | ||
| 28 | * predictable (it's just a function of time and the likely is | ||
| 29 | * very likely) and there's a data dependence, so force GCC | ||
| 30 | * to generate a branch instead. I don't barrier() because | ||
| 31 | * we don't actually need a barrier, and if this function | ||
| 32 | * ever gets inlined it will generate worse code. | ||
| 33 | */ | ||
| 34 | asm volatile (""); | ||
| 35 | return last; | ||
| 36 | } | ||
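The deleted function reappears nearly verbatim in the vDSO's `vclock_gettime.c` below. The move is also what forces the vDSO build to support alternative patching: `rdtsc_barrier()` expands through `alternative()`, so `patch_vdso()` (added in `vma.c` below) must fix up the image at boot. As a reminder of why, a sketch of `rdtsc_barrier()` as it looked at the time (reconstructed from `asm/system.h`; the feature-flag names are from memory):

```c
/* Sketch: boot-time patching turns these NOPs into whichever fence
 * the running CPU actually needs before rdtsc. */
static __always_inline void rdtsc_barrier(void)
{
	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
```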
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3e682184d76c..dda7dff9cef7 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
| @@ -2,6 +2,8 @@ | |||
| 2 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | 2 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
| 3 | * Copyright 2003 Andi Kleen, SuSE Labs. | 3 | * Copyright 2003 Andi Kleen, SuSE Labs. |
| 4 | * | 4 | * |
| 5 | * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] | ||
| 6 | * | ||
| 5 | * Thanks to hpa@transmeta.com for some useful hints. | 7 | * Thanks to hpa@transmeta.com for some useful hints. |
| 6 | * Special thanks to Ingo Molnar for his early experience with | 8 | * Special thanks to Ingo Molnar for his early experience with |
| 7 | * a different vsyscall implementation for Linux/IA32 and for the name. | 9 | * a different vsyscall implementation for Linux/IA32 and for the name. |
| @@ -11,10 +13,9 @@ | |||
| 11 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | 13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid |
| 12 | * jumping out of line if necessary. We cannot add more with this | 14 | * jumping out of line if necessary. We cannot add more with this |
| 13 | * mechanism because older kernels won't return -ENOSYS. | 15 | * mechanism because older kernels won't return -ENOSYS. |
| 14 | * If we want more than four we need a vDSO. | ||
| 15 | * | 16 | * |
| 16 | * Note: the concept clashes with user mode linux. If you use UML and | 17 | * Note: the concept clashes with user mode linux. UML users should |
| 17 | * want per guest time just set the kernel.vsyscall64 sysctl to 0. | 18 | * use the vDSO. |
| 18 | */ | 19 | */ |
| 19 | 20 | ||
| 20 | /* Disable profiling for userspace code: */ | 21 | /* Disable profiling for userspace code: */ |
| @@ -32,9 +33,12 @@ | |||
| 32 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
| 33 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
| 34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
| 36 | #include <linux/syscalls.h> | ||
| 37 | #include <linux/ratelimit.h> | ||
| 35 | 38 | ||
| 36 | #include <asm/vsyscall.h> | 39 | #include <asm/vsyscall.h> |
| 37 | #include <asm/pgtable.h> | 40 | #include <asm/pgtable.h> |
| 41 | #include <asm/compat.h> | ||
| 38 | #include <asm/page.h> | 42 | #include <asm/page.h> |
| 39 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
| 40 | #include <asm/fixmap.h> | 44 | #include <asm/fixmap.h> |
| @@ -44,16 +48,12 @@ | |||
| 44 | #include <asm/desc.h> | 48 | #include <asm/desc.h> |
| 45 | #include <asm/topology.h> | 49 | #include <asm/topology.h> |
| 46 | #include <asm/vgtod.h> | 50 | #include <asm/vgtod.h> |
| 47 | 51 | #include <asm/traps.h> | |
| 48 | #define __vsyscall(nr) \ | ||
| 49 | __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace | ||
| 50 | #define __syscall_clobber "r11","cx","memory" | ||
| 51 | 52 | ||
| 52 | DEFINE_VVAR(int, vgetcpu_mode); | 53 | DEFINE_VVAR(int, vgetcpu_mode); |
| 53 | DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = | 54 | DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = |
| 54 | { | 55 | { |
| 55 | .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), | 56 | .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), |
| 56 | .sysctl_enabled = 1, | ||
| 57 | }; | 57 | }; |
| 58 | 58 | ||
| 59 | void update_vsyscall_tz(void) | 59 | void update_vsyscall_tz(void) |
| @@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, | |||
| 72 | unsigned long flags; | 72 | unsigned long flags; |
| 73 | 73 | ||
| 74 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); | 74 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); |
| 75 | |||
| 75 | /* copy vsyscall data */ | 76 | /* copy vsyscall data */ |
| 76 | vsyscall_gtod_data.clock.vread = clock->vread; | 77 | vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; |
| 77 | vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; | 78 | vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; |
| 78 | vsyscall_gtod_data.clock.mask = clock->mask; | 79 | vsyscall_gtod_data.clock.mask = clock->mask; |
| 79 | vsyscall_gtod_data.clock.mult = mult; | 80 | vsyscall_gtod_data.clock.mult = mult; |
| 80 | vsyscall_gtod_data.clock.shift = clock->shift; | 81 | vsyscall_gtod_data.clock.shift = clock->shift; |
| 81 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | 82 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; |
| 82 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | 83 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; |
| 83 | vsyscall_gtod_data.wall_to_monotonic = *wtm; | 84 | vsyscall_gtod_data.wall_to_monotonic = *wtm; |
| 84 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); | 85 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); |
| 86 | |||
| 85 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 87 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
| 86 | } | 88 | } |
| 87 | 89 | ||
| 88 | /* RED-PEN may want to readd seq locking, but then the variable should be | 90 | static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, |
| 89 | * write-once. | 91 | const char *message) |
| 90 | */ | ||
| 91 | static __always_inline void do_get_tz(struct timezone * tz) | ||
| 92 | { | 92 | { |
| 93 | *tz = VVAR(vsyscall_gtod_data).sys_tz; | 93 | static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); |
| 94 | } | 94 | struct task_struct *tsk; |
| 95 | 95 | ||
| 96 | static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | 96 | if (!show_unhandled_signals || !__ratelimit(&rs)) |
| 97 | { | 97 | return; |
| 98 | int ret; | ||
| 99 | asm volatile("syscall" | ||
| 100 | : "=a" (ret) | ||
| 101 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) | ||
| 102 | : __syscall_clobber ); | ||
| 103 | return ret; | ||
| 104 | } | ||
| 105 | 98 | ||
| 106 | static __always_inline long time_syscall(long *t) | 99 | tsk = current; |
| 107 | { | ||
| 108 | long secs; | ||
| 109 | asm volatile("syscall" | ||
| 110 | : "=a" (secs) | ||
| 111 | : "0" (__NR_time),"D" (t) : __syscall_clobber); | ||
| 112 | return secs; | ||
| 113 | } | ||
| 114 | 100 | ||
| 115 | static __always_inline void do_vgettimeofday(struct timeval * tv) | 101 | printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", |
| 116 | { | 102 | level, tsk->comm, task_pid_nr(tsk), |
| 117 | cycle_t now, base, mask, cycle_delta; | 103 | message, regs->ip - 2, regs->cs, |
| 118 | unsigned seq; | 104 | regs->sp, regs->ax, regs->si, regs->di); |
| 119 | unsigned long mult, shift, nsec; | ||
| 120 | cycle_t (*vread)(void); | ||
| 121 | do { | ||
| 122 | seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); | ||
| 123 | |||
| 124 | vread = VVAR(vsyscall_gtod_data).clock.vread; | ||
| 125 | if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || | ||
| 126 | !vread)) { | ||
| 127 | gettimeofday(tv,NULL); | ||
| 128 | return; | ||
| 129 | } | ||
| 130 | |||
| 131 | now = vread(); | ||
| 132 | base = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
| 133 | mask = VVAR(vsyscall_gtod_data).clock.mask; | ||
| 134 | mult = VVAR(vsyscall_gtod_data).clock.mult; | ||
| 135 | shift = VVAR(vsyscall_gtod_data).clock.shift; | ||
| 136 | |||
| 137 | tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; | ||
| 138 | nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; | ||
| 139 | } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); | ||
| 140 | |||
| 141 | /* calculate interval: */ | ||
| 142 | cycle_delta = (now - base) & mask; | ||
| 143 | /* convert to nsecs: */ | ||
| 144 | nsec += (cycle_delta * mult) >> shift; | ||
| 145 | |||
| 146 | while (nsec >= NSEC_PER_SEC) { | ||
| 147 | tv->tv_sec += 1; | ||
| 148 | nsec -= NSEC_PER_SEC; | ||
| 149 | } | ||
| 150 | tv->tv_usec = nsec / NSEC_PER_USEC; | ||
| 151 | } | 105 | } |
| 152 | 106 | ||
| 153 | int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | 107 | static int addr_to_vsyscall_nr(unsigned long addr) |
| 154 | { | 108 | { |
| 155 | if (tv) | 109 | int nr; |
| 156 | do_vgettimeofday(tv); | ||
| 157 | if (tz) | ||
| 158 | do_get_tz(tz); | ||
| 159 | return 0; | ||
| 160 | } | ||
| 161 | 110 | ||
| 162 | /* This will break when the xtime seconds get inaccurate, but that is | 111 | if ((addr & ~0xC00UL) != VSYSCALL_START) |
| 163 | * unlikely */ | 112 | return -EINVAL; |
| 164 | time_t __vsyscall(1) vtime(time_t *t) | ||
| 165 | { | ||
| 166 | unsigned seq; | ||
| 167 | time_t result; | ||
| 168 | if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) | ||
| 169 | return time_syscall(t); | ||
| 170 | 113 | ||
| 171 | do { | 114 | nr = (addr & 0xC00UL) >> 10; |
| 172 | seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); | 115 | if (nr >= 3) |
| 116 | return -EINVAL; | ||
| 173 | 117 | ||
| 174 | result = VVAR(vsyscall_gtod_data).wall_time_sec; | 118 | return nr; |
| 119 | } | ||
| 175 | 120 | ||
| 176 | } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); | 121 | void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) |
| 122 | { | ||
| 123 | struct task_struct *tsk; | ||
| 124 | unsigned long caller; | ||
| 125 | int vsyscall_nr; | ||
| 126 | long ret; | ||
| 127 | |||
| 128 | local_irq_enable(); | ||
| 129 | |||
| 130 | /* | ||
| 131 | * Real 64-bit user mode code has cs == __USER_CS. Anything else | ||
| 132 | * is bogus. | ||
| 133 | */ | ||
| 134 | if (regs->cs != __USER_CS) { | ||
| 135 | /* | ||
| 136 | * If we trapped from kernel mode, we might as well OOPS now | ||
| 137 | * instead of returning to some random address and OOPSing | ||
| 138 | * then. | ||
| 139 | */ | ||
| 140 | BUG_ON(!user_mode(regs)); | ||
| 141 | |||
| 142 | /* Compat mode and non-compat 32-bit CS should both segfault. */ | ||
| 143 | warn_bad_vsyscall(KERN_WARNING, regs, | ||
| 144 | "illegal int 0xcc from 32-bit mode"); | ||
| 145 | goto sigsegv; | ||
| 146 | } | ||
| 177 | 147 | ||
| 178 | if (t) | 148 | /* |
| 179 | *t = result; | 149 | * x86-ism here: regs->ip points to the instruction after the int 0xcc, |
| 180 | return result; | 150 | * and int 0xcc is two bytes long. |
| 181 | } | 151 | */ |
| 152 | vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); | ||
| 153 | if (vsyscall_nr < 0) { | ||
| 154 | warn_bad_vsyscall(KERN_WARNING, regs, | ||
| 155 | "illegal int 0xcc (exploit attempt?)"); | ||
| 156 | goto sigsegv; | ||
| 157 | } | ||
| 182 | 158 | ||
| 183 | /* Fast way to get current CPU and node. | 159 | if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { |
| 184 | This helps to do per node and per CPU caches in user space. | 160 | warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); |
| 185 | The result is not guaranteed without CPU affinity, but usually | 161 | goto sigsegv; |
| 186 | works out because the scheduler tries to keep a thread on the same | 162 | } |
| 187 | CPU. | ||
| 188 | 163 | ||
| 189 | tcache must point to a two element sized long array. | 164 | tsk = current; |
| 190 | All arguments can be NULL. */ | 165 | if (seccomp_mode(&tsk->seccomp)) |
| 191 | long __vsyscall(2) | 166 | do_exit(SIGKILL); |
| 192 | vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | 167 | |
| 193 | { | 168 | switch (vsyscall_nr) { |
| 194 | unsigned int p; | 169 | case 0: |
| 195 | unsigned long j = 0; | 170 | ret = sys_gettimeofday( |
| 196 | 171 | (struct timeval __user *)regs->di, | |
| 197 | /* Fast cache - only recompute value once per jiffies and avoid | 172 | (struct timezone __user *)regs->si); |
| 198 | relatively costly rdtscp/cpuid otherwise. | 173 | break; |
| 199 | This works because the scheduler usually keeps the process | 174 | |
| 200 | on the same CPU and this syscall doesn't guarantee its | 175 | case 1: |
| 201 | results anyways. | 176 | ret = sys_time((time_t __user *)regs->di); |
| 202 | We do this here because otherwise user space would do it on | 177 | break; |
| 203 | its own in a likely inferior way (no access to jiffies). | 178 | |
| 204 | If you don't like it pass NULL. */ | 179 | case 2: |
| 205 | if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { | 180 | ret = sys_getcpu((unsigned __user *)regs->di, |
| 206 | p = tcache->blob[1]; | 181 | (unsigned __user *)regs->si, |
| 207 | } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { | 182 | 0); |
| 208 | /* Load per CPU data from RDTSCP */ | 183 | break; |
| 209 | native_read_tscp(&p); | ||
| 210 | } else { | ||
| 211 | /* Load per CPU data from GDT */ | ||
| 212 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | ||
| 213 | } | 184 | } |
| 214 | if (tcache) { | 185 | |
| 215 | tcache->blob[0] = j; | 186 | if (ret == -EFAULT) { |
| 216 | tcache->blob[1] = p; | 187 | /* |
| 188 | * Bad news -- userspace fed a bad pointer to a vsyscall. | ||
| 189 | * | ||
| 190 | * With a real vsyscall, that would have caused SIGSEGV. | ||
| 191 | * To make writing reliable exploits using the emulated | ||
| 192 | * vsyscalls harder, generate SIGSEGV here as well. | ||
| 193 | */ | ||
| 194 | warn_bad_vsyscall(KERN_INFO, regs, | ||
| 195 | "vsyscall fault (exploit attempt?)"); | ||
| 196 | goto sigsegv; | ||
| 217 | } | 197 | } |
| 218 | if (cpu) | ||
| 219 | *cpu = p & 0xfff; | ||
| 220 | if (node) | ||
| 221 | *node = p >> 12; | ||
| 222 | return 0; | ||
| 223 | } | ||
| 224 | 198 | ||
| 225 | static long __vsyscall(3) venosys_1(void) | 199 | regs->ax = ret; |
| 226 | { | ||
| 227 | return -ENOSYS; | ||
| 228 | } | ||
| 229 | 200 | ||
| 230 | #ifdef CONFIG_SYSCTL | 201 | /* Emulate a ret instruction. */ |
| 231 | static ctl_table kernel_table2[] = { | 202 | regs->ip = caller; |
| 232 | { .procname = "vsyscall64", | 203 | regs->sp += 8; |
| 233 | .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), | ||
| 234 | .mode = 0644, | ||
| 235 | .proc_handler = proc_dointvec }, | ||
| 236 | {} | ||
| 237 | }; | ||
| 238 | 204 | ||
| 239 | static ctl_table kernel_root_table2[] = { | 205 | local_irq_disable(); |
| 240 | { .procname = "kernel", .mode = 0555, | 206 | return; |
| 241 | .child = kernel_table2 }, | 207 | |
| 242 | {} | 208 | sigsegv: |
| 243 | }; | 209 | regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ |
| 244 | #endif | 210 | force_sig(SIGSEGV, current); |
| 211 | local_irq_disable(); | ||
| 212 | } | ||
| 245 | 213 | ||
| 246 | /* Assume __initcall executes before all user space. Hopefully kmod | 214 | /* |
| 247 | doesn't violate that. We'll find out if it does. */ | 215 | * Assume __initcall executes before all user space. Hopefully kmod |
| 216 | * doesn't violate that. We'll find out if it does. | ||
| 217 | */ | ||
| 248 | static void __cpuinit vsyscall_set_cpu(int cpu) | 218 | static void __cpuinit vsyscall_set_cpu(int cpu) |
| 249 | { | 219 | { |
| 250 | unsigned long d; | 220 | unsigned long d; |
| @@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu) | |||
| 255 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) | 225 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) |
| 256 | write_rdtscp_aux((node << 12) | cpu); | 226 | write_rdtscp_aux((node << 12) | cpu); |
| 257 | 227 | ||
| 258 | /* Store cpu number in limit so that it can be loaded quickly | 228 | /* |
| 259 | in user space in vgetcpu. | 229 | * Store cpu number in limit so that it can be loaded quickly |
| 260 | 12 bits for the CPU and 8 bits for the node. */ | 230 | * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) |
| 231 | */ | ||
| 261 | d = 0x0f40000000000ULL; | 232 | d = 0x0f40000000000ULL; |
| 262 | d |= cpu; | 233 | d |= cpu; |
| 263 | d |= (node & 0xf) << 12; | 234 | d |= (node & 0xf) << 12; |
| 264 | d |= (node >> 4) << 48; | 235 | d |= (node >> 4) << 48; |
| 236 | |||
| 265 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); | 237 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); |
| 266 | } | 238 | } |
| 267 | 239 | ||
| @@ -275,8 +247,10 @@ static int __cpuinit | |||
| 275 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | 247 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) |
| 276 | { | 248 | { |
| 277 | long cpu = (long)arg; | 249 | long cpu = (long)arg; |
| 250 | |||
| 278 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | 251 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) |
| 279 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); | 252 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); |
| 253 | |||
| 280 | return NOTIFY_DONE; | 254 | return NOTIFY_DONE; |
| 281 | } | 255 | } |
| 282 | 256 | ||
| @@ -284,25 +258,23 @@ void __init map_vsyscall(void) | |||
| 284 | { | 258 | { |
| 285 | extern char __vsyscall_0; | 259 | extern char __vsyscall_0; |
| 286 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | 260 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); |
| 261 | extern char __vvar_page; | ||
| 262 | unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); | ||
| 287 | 263 | ||
| 288 | /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ | 264 | /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ |
| 289 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | 265 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); |
| 266 | __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); | ||
| 267 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); | ||
| 290 | } | 268 | } |
| 291 | 269 | ||
| 292 | static int __init vsyscall_init(void) | 270 | static int __init vsyscall_init(void) |
| 293 | { | 271 | { |
| 294 | BUG_ON(((unsigned long) &vgettimeofday != | 272 | BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); |
| 295 | VSYSCALL_ADDR(__NR_vgettimeofday))); | 273 | |
| 296 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | ||
| 297 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | ||
| 298 | BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); | ||
| 299 | #ifdef CONFIG_SYSCTL | ||
| 300 | register_sysctl_table(kernel_root_table2); | ||
| 301 | #endif | ||
| 302 | on_each_cpu(cpu_vsyscall_init, NULL, 1); | 274 | on_each_cpu(cpu_vsyscall_init, NULL, 1); |
| 303 | /* notifier priority > KVM */ | 275 | /* notifier priority > KVM */ |
| 304 | hotcpu_notifier(cpu_vsyscall_notifier, 30); | 276 | hotcpu_notifier(cpu_vsyscall_notifier, 30); |
| 277 | |||
| 305 | return 0; | 278 | return 0; |
| 306 | } | 279 | } |
| 307 | |||
| 308 | __initcall(vsyscall_init); | 280 | __initcall(vsyscall_init); |
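For a concrete picture of what `do_emulate_vsyscall()` now services, consider a hypothetical pre-vDSO static binary that calls the `vtime` vsyscall at its fixed address (entry 1, i.e. `VSYSCALL_START + 1024`). After this patch the call lands on an `int $0xcc` instruction and is emulated, rather than executing real code in the vsyscall page:

```c
/* Hypothetical legacy caller (illustration only): */
typedef long (*vtime_fn)(long *t);

static long legacy_time(void)
{
	vtime_fn vtime = (vtime_fn)0xffffffffff600400UL;	/* vsyscall 1 */
	return vtime(NULL);	/* traps into do_emulate_vsyscall() */
}
```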
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S new file mode 100644 index 000000000000..ffa845eae5ca --- /dev/null +++ b/arch/x86/kernel/vsyscall_emu_64.S | |||
| @@ -0,0 +1,27 @@ | |||
| 1 | /* | ||
| 2 | * vsyscall_emu_64.S: Vsyscall emulation page | ||
| 3 | * | ||
| 4 | * Copyright (c) 2011 Andy Lutomirski | ||
| 5 | * | ||
| 6 | * Subject to the GNU General Public License, version 2 | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/linkage.h> | ||
| 10 | #include <asm/irq_vectors.h> | ||
| 11 | |||
| 12 | /* The unused parts of the page are filled with 0xcc by the linker script. */ | ||
| 13 | |||
| 14 | .section .vsyscall_0, "a" | ||
| 15 | ENTRY(vsyscall_0) | ||
| 16 | int $VSYSCALL_EMU_VECTOR | ||
| 17 | END(vsyscall_0) | ||
| 18 | |||
| 19 | .section .vsyscall_1, "a" | ||
| 20 | ENTRY(vsyscall_1) | ||
| 21 | int $VSYSCALL_EMU_VECTOR | ||
| 22 | END(vsyscall_1) | ||
| 23 | |||
| 24 | .section .vsyscall_2, "a" | ||
| 25 | ENTRY(vsyscall_2) | ||
| 26 | int $VSYSCALL_EMU_VECTOR | ||
| 27 | END(vsyscall_2) | ||
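Each entry assembles to the two bytes 0xcd 0xcc (`int imm8` is opcode 0xcd followed by the vector), and the linker script fills the rest of the page with 0xcc. That encoding is why `do_emulate_vsyscall()` computes the entry address as `regs->ip - 2`. A hypothetical check making the byte layout explicit:

```c
#include <stdbool.h>

/* Hypothetical helper: an emulated vsyscall entry begins with the
 * two-byte "int $0xcc" (0xcd, then the 0xcc vector). */
static bool is_emulated_vsyscall(const unsigned char *entry)
{
	return entry[0] == 0xcd && entry[1] == 0xcc;
}
```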
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6fec2d1cebe1..01c805ba5359 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | 2 | ||
| 3 | #include <linux/linkage.h> | 3 | #include <linux/linkage.h> |
| 4 | #include <asm/dwarf2.h> | 4 | #include <asm/dwarf2.h> |
| 5 | #include <asm/alternative-asm.h> | ||
| 5 | 6 | ||
| 6 | ALIGN | 7 | ALIGN |
| 7 | copy_page_c: | 8 | copy_page_c: |
| @@ -110,10 +111,6 @@ ENDPROC(copy_page) | |||
| 110 | 2: | 111 | 2: |
| 111 | .previous | 112 | .previous |
| 112 | .section .altinstructions,"a" | 113 | .section .altinstructions,"a" |
| 113 | .align 8 | 114 | altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ |
| 114 | .quad copy_page | 115 | .Lcopy_page_end-copy_page, 2b-1b |
| 115 | .quad 1b | ||
| 116 | .word X86_FEATURE_REP_GOOD | ||
| 117 | .byte .Lcopy_page_end - copy_page | ||
| 118 | .byte 2b - 1b | ||
| 119 | .previous | 116 | .previous |
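`altinstruction_entry` replaces the open-coded record here and in memmove below. Together with the "make alternative instruction pointers relative" change in this series, each record stores 32-bit self-relative offsets instead of 64-bit absolute addresses, which is what allows `.altinstructions` to be carried in (and applied to) the position-independent vDSO. A sketch of the record layout the macro emits (field names recalled from `asm/alternative.h`, not shown in this hunk):

```c
#include <linux/types.h>

/* Sketch of one .altinstructions record after the switch to
 * relative pointers (field names assumed): */
struct alt_instr {
	s32 instr_offset;	/* original instruction, relative to this field */
	s32 repl_offset;	/* replacement instruction, relative */
	u16 cpuid;		/* CPU feature bit that selects the replacement */
	u8  instrlen;		/* length of the original instruction */
	u8  replacementlen;	/* length of the replacement, <= instrlen */
};
```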
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index d0ec9c2936d7..ee164610ec46 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/linkage.h> | 9 | #include <linux/linkage.h> |
| 10 | #include <asm/dwarf2.h> | 10 | #include <asm/dwarf2.h> |
| 11 | #include <asm/cpufeature.h> | 11 | #include <asm/cpufeature.h> |
| 12 | #include <asm/alternative-asm.h> | ||
| 12 | 13 | ||
| 13 | #undef memmove | 14 | #undef memmove |
| 14 | 15 | ||
| @@ -214,11 +215,9 @@ ENTRY(memmove) | |||
| 214 | .previous | 215 | .previous |
| 215 | 216 | ||
| 216 | .section .altinstructions,"a" | 217 | .section .altinstructions,"a" |
| 217 | .align 8 | 218 | altinstruction_entry .Lmemmove_begin_forward, \ |
| 218 | .quad .Lmemmove_begin_forward | 219 | .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ |
| 219 | .quad .Lmemmove_begin_forward_efs | 220 | .Lmemmove_end_forward-.Lmemmove_begin_forward, \ |
| 220 | .word X86_FEATURE_ERMS | 221 | .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs |
| 221 | .byte .Lmemmove_end_forward-.Lmemmove_begin_forward | ||
| 222 | .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs | ||
| 223 | .previous | 222 | .previous |
| 224 | ENDPROC(memmove) | 223 | ENDPROC(memmove) |
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index bef0bc962400..5d179502a52c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
| @@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) | |||
| 26 | export CPPFLAGS_vdso.lds += -P -C | 26 | export CPPFLAGS_vdso.lds += -P -C |
| 27 | 27 | ||
| 28 | VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ | 28 | VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ |
| 29 | -Wl,--no-undefined \ | ||
| 29 | -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 | 30 | -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 |
| 30 | 31 | ||
| 31 | $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so | 32 | $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so |
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index a724905fdae7..6bc0e723b6e8 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c | |||
| @@ -6,7 +6,6 @@ | |||
| 6 | * | 6 | * |
| 7 | * The code should have no internal unresolved relocations. | 7 | * The code should have no internal unresolved relocations. |
| 8 | * Check with readelf after changing. | 8 | * Check with readelf after changing. |
| 9 | * Also alternative() doesn't work. | ||
| 10 | */ | 9 | */ |
| 11 | 10 | ||
| 12 | /* Disable profiling for userspace code: */ | 11 | /* Disable profiling for userspace code: */ |
| @@ -17,6 +16,7 @@ | |||
| 17 | #include <linux/time.h> | 16 | #include <linux/time.h> |
| 18 | #include <linux/string.h> | 17 | #include <linux/string.h> |
| 19 | #include <asm/vsyscall.h> | 18 | #include <asm/vsyscall.h> |
| 19 | #include <asm/fixmap.h> | ||
| 20 | #include <asm/vgtod.h> | 20 | #include <asm/vgtod.h> |
| 21 | #include <asm/timex.h> | 21 | #include <asm/timex.h> |
| 22 | #include <asm/hpet.h> | 22 | #include <asm/hpet.h> |
| @@ -25,6 +25,43 @@ | |||
| 25 | 25 | ||
| 26 | #define gtod (&VVAR(vsyscall_gtod_data)) | 26 | #define gtod (&VVAR(vsyscall_gtod_data)) |
| 27 | 27 | ||
| 28 | notrace static cycle_t vread_tsc(void) | ||
| 29 | { | ||
| 30 | cycle_t ret; | ||
| 31 | u64 last; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * Empirically, a fence (of type that depends on the CPU) | ||
| 35 | * before rdtsc is enough to ensure that rdtsc is ordered | ||
| 36 | * with respect to loads. The various CPU manuals are unclear | ||
| 37 | * as to whether rdtsc can be reordered with later loads, | ||
| 38 | * but no one has ever seen it happen. | ||
| 39 | */ | ||
| 40 | rdtsc_barrier(); | ||
| 41 | ret = (cycle_t)vget_cycles(); | ||
| 42 | |||
| 43 | last = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
| 44 | |||
| 45 | if (likely(ret >= last)) | ||
| 46 | return ret; | ||
| 47 | |||
| 48 | /* | ||
| 49 | * GCC likes to generate cmov here, but this branch is extremely | ||
| 50 | * predictable (it's just a funciton of time and the likely is | ||
| 51 | * very likely) and there's a data dependence, so force GCC | ||
| 52 | * to generate a branch instead. I don't barrier() because | ||
| 53 | * we don't actually need a barrier, and if this function | ||
| 54 | * ever gets inlined it will generate worse code. | ||
| 55 | */ | ||
| 56 | asm volatile (""); | ||
| 57 | return last; | ||
| 58 | } | ||
| 59 | |||
| 60 | static notrace cycle_t vread_hpet(void) | ||
| 61 | { | ||
| 62 | return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); | ||
| 63 | } | ||
| 64 | |||
| 28 | notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) | 65 | notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) |
| 29 | { | 66 | { |
| 30 | long ret; | 67 | long ret; |
| @@ -36,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) | |||
| 36 | notrace static inline long vgetns(void) | 73 | notrace static inline long vgetns(void) |
| 37 | { | 74 | { |
| 38 | long v; | 75 | long v; |
| 39 | cycles_t (*vread)(void); | 76 | cycles_t cycles; |
| 40 | vread = gtod->clock.vread; | 77 | if (gtod->clock.vclock_mode == VCLOCK_TSC) |
| 41 | v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; | 78 | cycles = vread_tsc(); |
| 79 | else | ||
| 80 | cycles = vread_hpet(); | ||
| 81 | v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; | ||
| 42 | return (v * gtod->clock.mult) >> gtod->clock.shift; | 82 | return (v * gtod->clock.mult) >> gtod->clock.shift; |
| 43 | } | 83 | } |
| 44 | 84 | ||
| @@ -116,21 +156,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) | |||
| 116 | 156 | ||
| 117 | notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) | 157 | notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) |
| 118 | { | 158 | { |
| 119 | if (likely(gtod->sysctl_enabled)) | 159 | switch (clock) { |
| 120 | switch (clock) { | 160 | case CLOCK_REALTIME: |
| 121 | case CLOCK_REALTIME: | 161 | if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) |
| 122 | if (likely(gtod->clock.vread)) | 162 | return do_realtime(ts); |
| 123 | return do_realtime(ts); | 163 | break; |
| 124 | break; | 164 | case CLOCK_MONOTONIC: |
| 125 | case CLOCK_MONOTONIC: | 165 | if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) |
| 126 | if (likely(gtod->clock.vread)) | 166 | return do_monotonic(ts); |
| 127 | return do_monotonic(ts); | 167 | break; |
| 128 | break; | 168 | case CLOCK_REALTIME_COARSE: |
| 129 | case CLOCK_REALTIME_COARSE: | 169 | return do_realtime_coarse(ts); |
| 130 | return do_realtime_coarse(ts); | 170 | case CLOCK_MONOTONIC_COARSE: |
| 131 | case CLOCK_MONOTONIC_COARSE: | 171 | return do_monotonic_coarse(ts); |
| 132 | return do_monotonic_coarse(ts); | 172 | } |
| 133 | } | 173 | |
| 134 | return vdso_fallback_gettime(clock, ts); | 174 | return vdso_fallback_gettime(clock, ts); |
| 135 | } | 175 | } |
| 136 | int clock_gettime(clockid_t, struct timespec *) | 176 | int clock_gettime(clockid_t, struct timespec *) |
| @@ -139,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *) | |||
| 139 | notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) | 179 | notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) |
| 140 | { | 180 | { |
| 141 | long ret; | 181 | long ret; |
| 142 | if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { | 182 | if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) { |
| 143 | if (likely(tv != NULL)) { | 183 | if (likely(tv != NULL)) { |
| 144 | BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != | 184 | BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != |
| 145 | offsetof(struct timespec, tv_nsec) || | 185 | offsetof(struct timespec, tv_nsec) || |
| @@ -161,27 +201,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) | |||
| 161 | int gettimeofday(struct timeval *, struct timezone *) | 201 | int gettimeofday(struct timeval *, struct timezone *) |
| 162 | __attribute__((weak, alias("__vdso_gettimeofday"))); | 202 | __attribute__((weak, alias("__vdso_gettimeofday"))); |
| 163 | 203 | ||
| 164 | /* This will break when the xtime seconds get inaccurate, but that is | 204 | /* |
| 165 | * unlikely */ | 205 | * This will break when the xtime seconds get inaccurate, but that is |
| 166 | 206 | * unlikely | |
| 167 | static __always_inline long time_syscall(long *t) | 207 | */ |
| 168 | { | ||
| 169 | long secs; | ||
| 170 | asm volatile("syscall" | ||
| 171 | : "=a" (secs) | ||
| 172 | : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); | ||
| 173 | return secs; | ||
| 174 | } | ||
| 175 | |||
| 176 | notrace time_t __vdso_time(time_t *t) | 208 | notrace time_t __vdso_time(time_t *t) |
| 177 | { | 209 | { |
| 178 | time_t result; | ||
| 179 | |||
| 180 | if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) | ||
| 181 | return time_syscall(t); | ||
| 182 | |||
| 183 | /* This is atomic on x86_64 so we don't need any locks. */ | 210 | /* This is atomic on x86_64 so we don't need any locks. */ |
| 184 | result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); | 211 | time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); |
| 185 | 212 | ||
| 186 | if (t) | 213 | if (t) |
| 187 | *t = result; | 214 | *t = result; |
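To make the `vgetns()` arithmetic above concrete: `mult` and `shift` are chosen so that `mult / 2^shift` approximates the clocksource's nanoseconds per cycle. A worked example, assuming a clocksource ticking at exactly 1 GHz with `shift = 24`:

```c
/* Worked example of the vgetns() conversion (assumed parameters):
 * 1 GHz -> 1 ns per cycle, so mult = 1 << shift. */
static inline unsigned long cycles_to_ns(unsigned long cycles)
{
	const unsigned long mult = 1UL << 24;
	const unsigned int shift = 24;

	return (cycles * mult) >> shift;	/* 3000000000 cycles -> 3 s */
}
```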
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1d3aa6b87181..1b979c12ba85 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S | |||
| @@ -1,10 +1,21 @@ | |||
| 1 | #include <asm/page_types.h> | ||
| 2 | #include <linux/linkage.h> | ||
| 1 | #include <linux/init.h> | 3 | #include <linux/init.h> |
| 2 | 4 | ||
| 3 | __INITDATA | 5 | __PAGE_ALIGNED_DATA |
| 4 | 6 | ||
| 5 | .globl vdso_start, vdso_end | 7 | .globl vdso_start, vdso_end |
| 8 | .align PAGE_SIZE | ||
| 6 | vdso_start: | 9 | vdso_start: |
| 7 | .incbin "arch/x86/vdso/vdso.so" | 10 | .incbin "arch/x86/vdso/vdso.so" |
| 8 | vdso_end: | 11 | vdso_end: |
| 9 | 12 | ||
| 10 | __FINIT | 13 | .previous |
| 14 | |||
| 15 | .globl vdso_pages | ||
| 16 | .bss | ||
| 17 | .align 8 | ||
| 18 | .type vdso_pages, @object | ||
| 19 | vdso_pages: | ||
| 20 | .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 | ||
| 21 | .size vdso_pages, .-vdso_pages | ||
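The new `.bss` block statically reserves one 8-byte slot per page of `vdso.so`, replacing the `kmalloc()` that `vma.c` used to do; this is the mechanism behind "Do not allocate memory for the vDSO". Seen from C, the reservation is simply:

```c
/* C view of the .bss reservation above: sized at assembly time from
 * vdso_start/vdso_end and filled in by init_vdso() in vma.c below. */
extern struct page *vdso_pages[];
```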
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7abd2be0f9b9..316fbca3490e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c | |||
| @@ -14,41 +14,61 @@ | |||
| 14 | #include <asm/vgtod.h> | 14 | #include <asm/vgtod.h> |
| 15 | #include <asm/proto.h> | 15 | #include <asm/proto.h> |
| 16 | #include <asm/vdso.h> | 16 | #include <asm/vdso.h> |
| 17 | #include <asm/page.h> | ||
| 17 | 18 | ||
| 18 | unsigned int __read_mostly vdso_enabled = 1; | 19 | unsigned int __read_mostly vdso_enabled = 1; |
| 19 | 20 | ||
| 20 | extern char vdso_start[], vdso_end[]; | 21 | extern char vdso_start[], vdso_end[]; |
| 21 | extern unsigned short vdso_sync_cpuid; | 22 | extern unsigned short vdso_sync_cpuid; |
| 22 | 23 | ||
| 23 | static struct page **vdso_pages; | 24 | extern struct page *vdso_pages[]; |
| 24 | static unsigned vdso_size; | 25 | static unsigned vdso_size; |
| 25 | 26 | ||
| 26 | static int __init init_vdso_vars(void) | 27 | static void __init patch_vdso(void *vdso, size_t len) |
| 28 | { | ||
| 29 | Elf64_Ehdr *hdr = vdso; | ||
| 30 | Elf64_Shdr *sechdrs, *alt_sec = 0; | ||
| 31 | char *secstrings; | ||
| 32 | void *alt_data; | ||
| 33 | int i; | ||
| 34 | |||
| 35 | BUG_ON(len < sizeof(Elf64_Ehdr)); | ||
| 36 | BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); | ||
| 37 | |||
| 38 | sechdrs = (void *)hdr + hdr->e_shoff; | ||
| 39 | secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
| 40 | |||
| 41 | for (i = 1; i < hdr->e_shnum; i++) { | ||
| 42 | Elf64_Shdr *shdr = &sechdrs[i]; | ||
| 43 | if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { | ||
| 44 | alt_sec = shdr; | ||
| 45 | goto found; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | |||
| 49 | /* If we get here, it's probably a bug. */ | ||
| 50 | pr_warning("patch_vdso: .altinstructions not found\n"); | ||
| 51 | return; /* nothing to patch */ | ||
| 52 | |||
| 53 | found: | ||
| 54 | alt_data = (void *)hdr + alt_sec->sh_offset; | ||
| 55 | apply_alternatives(alt_data, alt_data + alt_sec->sh_size); | ||
| 56 | } | ||
| 57 | |||
| 58 | static int __init init_vdso(void) | ||
| 27 | { | 59 | { |
| 28 | int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; | 60 | int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; |
| 29 | int i; | 61 | int i; |
| 30 | 62 | ||
| 63 | patch_vdso(vdso_start, vdso_end - vdso_start); | ||
| 64 | |||
| 31 | vdso_size = npages << PAGE_SHIFT; | 65 | vdso_size = npages << PAGE_SHIFT; |
| 32 | vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); | 66 | for (i = 0; i < npages; i++) |
| 33 | if (!vdso_pages) | 67 | vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); |
| 34 | goto oom; | ||
| 35 | for (i = 0; i < npages; i++) { | ||
| 36 | struct page *p; | ||
| 37 | p = alloc_page(GFP_KERNEL); | ||
| 38 | if (!p) | ||
| 39 | goto oom; | ||
| 40 | vdso_pages[i] = p; | ||
| 41 | copy_page(page_address(p), vdso_start + i*PAGE_SIZE); | ||
| 42 | } | ||
| 43 | 68 | ||
| 44 | return 0; | 69 | return 0; |
| 45 | |||
| 46 | oom: | ||
| 47 | printk("Cannot allocate vdso\n"); | ||
| 48 | vdso_enabled = 0; | ||
| 49 | return -ENOMEM; | ||
| 50 | } | 70 | } |
| 51 | subsys_initcall(init_vdso_vars); | 71 | subsys_initcall(init_vdso); |
| 52 | 72 | ||
| 53 | struct linux_binprm; | 73 | struct linux_binprm; |
| 54 | 74 | ||
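`patch_vdso()` is a minimal ELF section-header walk. The same lookup, written as a standalone sketch for any in-memory ELF64 image (assumes a well-formed image and uses GNU-C void-pointer arithmetic, as the original does):

```c
#include <elf.h>
#include <string.h>

/* Sketch: find a named section header in an in-memory ELF64 image. */
static Elf64_Shdr *find_section(void *image, const char *name)
{
	Elf64_Ehdr *hdr = image;
	Elf64_Shdr *sechdrs = image + hdr->e_shoff;
	char *secstrings = image + sechdrs[hdr->e_shstrndx].sh_offset;
	int i;

	for (i = 1; i < hdr->e_shnum; i++)	/* index 0 is SHN_UNDEF */
		if (!strcmp(secstrings + sechdrs[i].sh_name, name))
			return &sechdrs[i];

	return NULL;
}
```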
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 34d6a1cab8de..0833896cf6f2 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c | |||
| @@ -952,7 +952,7 @@ int hpet_alloc(struct hpet_data *hdp) | |||
| 952 | #ifdef CONFIG_IA64 | 952 | #ifdef CONFIG_IA64 |
| 953 | if (!hpet_clocksource) { | 953 | if (!hpet_clocksource) { |
| 954 | hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc; | 954 | hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc; |
| 955 | CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr); | 955 | clocksource_hpet.archdata.fsys_mmio = hpet_mctr; |
| 956 | clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq); | 956 | clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq); |
| 957 | hpetp->hp_clocksource = &clocksource_hpet; | 957 | hpetp->hp_clocksource = &clocksource_hpet; |
| 958 | hpet_clocksource = &clocksource_hpet; | 958 | hpet_clocksource = &clocksource_hpet; |
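On ia64, the same `archdata` mechanism carries the fsyscall MMIO pointer that used to live in the generic `struct clocksource`. A sketch of the ia64 `asm/clocksource.h` added by this series (the field name is visible in the hunk above; the surrounding definition is reconstructed):

```c
/* Sketch of arch/ia64/include/asm/clocksource.h: */
struct arch_clocksource_data {
	void *fsys_mmio;	/* used by fsyscall asm code */
};
```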
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 18a1baf31f2d..139c4db55f17 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h | |||
| @@ -22,6 +22,10 @@ | |||
| 22 | typedef u64 cycle_t; | 22 | typedef u64 cycle_t; |
| 23 | struct clocksource; | 23 | struct clocksource; |
| 24 | 24 | ||
| 25 | #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA | ||
| 26 | #include <asm/clocksource.h> | ||
| 27 | #endif | ||
| 28 | |||
| 25 | /** | 29 | /** |
| 26 | * struct cyclecounter - hardware abstraction for a free running counter | 30 | * struct cyclecounter - hardware abstraction for a free running counter |
| 27 | * Provides completely state-free accessors to the underlying hardware. | 31 | * Provides completely state-free accessors to the underlying hardware. |
| @@ -153,7 +157,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, | |||
| 153 | * @shift: cycle to nanosecond divisor (power of two) | 157 | * @shift: cycle to nanosecond divisor (power of two) |
| 154 | * @max_idle_ns: max idle time permitted by the clocksource (nsecs) | 158 | * @max_idle_ns: max idle time permitted by the clocksource (nsecs) |
| 155 | * @flags: flags describing special properties | 159 | * @flags: flags describing special properties |
| 156 | * @vread: vsyscall based read | 160 | * @archdata: arch-specific data |
| 157 | * @suspend: suspend function for the clocksource, if necessary | 161 | * @suspend: suspend function for the clocksource, if necessary |
| 158 | * @resume: resume function for the clocksource, if necessary | 162 | * @resume: resume function for the clocksource, if necessary |
| 159 | */ | 163 | */ |
| @@ -169,16 +173,13 @@ struct clocksource { | |||
| 169 | u32 shift; | 173 | u32 shift; |
| 170 | u64 max_idle_ns; | 174 | u64 max_idle_ns; |
| 171 | 175 | ||
| 172 | #ifdef CONFIG_IA64 | 176 | #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA |
| 173 | void *fsys_mmio; /* used by fsyscall asm code */ | 177 | struct arch_clocksource_data archdata; |
| 174 | #define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr)) | ||
| 175 | #else | ||
| 176 | #define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0) | ||
| 177 | #endif | 178 | #endif |
| 179 | |||
| 178 | const char *name; | 180 | const char *name; |
| 179 | struct list_head list; | 181 | struct list_head list; |
| 180 | int rating; | 182 | int rating; |
| 181 | cycle_t (*vread)(void); | ||
| 182 | int (*enable)(struct clocksource *cs); | 183 | int (*enable)(struct clocksource *cs); |
| 183 | void (*disable)(struct clocksource *cs); | 184 | void (*disable)(struct clocksource *cs); |
| 184 | unsigned long flags; | 185 | unsigned long flags; |
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 167c33361d9c..cc7a4e9cc7ad 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h | |||
| @@ -19,6 +19,11 @@ static inline void secure_computing(int this_syscall) | |||
| 19 | extern long prctl_get_seccomp(void); | 19 | extern long prctl_get_seccomp(void); |
| 20 | extern long prctl_set_seccomp(unsigned long); | 20 | extern long prctl_set_seccomp(unsigned long); |
| 21 | 21 | ||
| 22 | static inline int seccomp_mode(seccomp_t *s) | ||
| 23 | { | ||
| 24 | return s->mode; | ||
| 25 | } | ||
| 26 | |||
| 22 | #else /* CONFIG_SECCOMP */ | 27 | #else /* CONFIG_SECCOMP */ |
| 23 | 28 | ||
| 24 | #include <linux/errno.h> | 29 | #include <linux/errno.h> |
| @@ -37,6 +42,11 @@ static inline long prctl_set_seccomp(unsigned long arg2) | |||
| 37 | return -EINVAL; | 42 | return -EINVAL; |
| 38 | } | 43 | } |
| 39 | 44 | ||
| 45 | static inline int seccomp_mode(seccomp_t *s) | ||
| 46 | { | ||
| 47 | return 0; | ||
| 48 | } | ||
| 49 | |||
| 40 | #endif /* CONFIG_SECCOMP */ | 50 | #endif /* CONFIG_SECCOMP */ |
| 41 | 51 | ||
| 42 | #endif /* _LINUX_SECCOMP_H */ | 52 | #endif /* _LINUX_SECCOMP_H */ |
