Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/hardirq.h | 2
-rw-r--r--  include/asm-generic/pgtable.h | 4
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 14
-rw-r--r--  include/linux/acpi_pmtmr.h | 2
-rw-r--r--  include/linux/ceph/auth.h | 92
-rw-r--r--  include/linux/ceph/buffer.h | 39
-rw-r--r--  include/linux/ceph/ceph_debug.h | 38
-rw-r--r--  include/linux/ceph/ceph_frag.h | 109
-rw-r--r--  include/linux/ceph/ceph_fs.h | 729
-rw-r--r--  include/linux/ceph/ceph_hash.h | 13
-rw-r--r--  include/linux/ceph/debugfs.h | 33
-rw-r--r--  include/linux/ceph/decode.h | 201
-rw-r--r--  include/linux/ceph/libceph.h | 249
-rw-r--r--  include/linux/ceph/mdsmap.h | 62
-rw-r--r--  include/linux/ceph/messenger.h | 261
-rw-r--r--  include/linux/ceph/mon_client.h | 122
-rw-r--r--  include/linux/ceph/msgpool.h | 25
-rw-r--r--  include/linux/ceph/msgr.h | 175
-rw-r--r--  include/linux/ceph/osd_client.h | 234
-rw-r--r--  include/linux/ceph/osdmap.h | 130
-rw-r--r--  include/linux/ceph/pagelist.h | 75
-rw-r--r--  include/linux/ceph/rados.h | 405
-rw-r--r--  include/linux/ceph/types.h | 29
-rw-r--r--  include/linux/cgroup.h | 4
-rw-r--r--  include/linux/compiler.h | 4
-rw-r--r--  include/linux/cred.h | 2
-rw-r--r--  include/linux/crush/crush.h | 180
-rw-r--r--  include/linux/crush/hash.h | 17
-rw-r--r--  include/linux/crush/mapper.h | 20
-rw-r--r--  include/linux/debug_locks.h | 5
-rw-r--r--  include/linux/dynamic_debug.h | 39
-rw-r--r--  include/linux/edac.h | 4
-rw-r--r--  include/linux/fdtable.h | 6
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/ftrace_event.h | 8
-rw-r--r--  include/linux/genhd.h | 6
-rw-r--r--  include/linux/hardirq.h | 11
-rw-r--r--  include/linux/idr.h | 4
-rw-r--r--  include/linux/init_task.h | 14
-rw-r--r--  include/linux/input.h | 2
-rw-r--r--  include/linux/interrupt.h | 8
-rw-r--r--  include/linux/iocontext.h | 2
-rw-r--r--  include/linux/irq_work.h | 20
-rw-r--r--  include/linux/jump_label.h | 74
-rw-r--r--  include/linux/jump_label_ref.h | 44
-rw-r--r--  include/linux/kernel.h | 13
-rw-r--r--  include/linux/key.h | 3
-rw-r--r--  include/linux/kvm_host.h | 2
-rw-r--r--  include/linux/lockdep.h | 13
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/module.h | 5
-rw-r--r--  include/linux/netfilter/nfnetlink_conntrack.h | 10
-rw-r--r--  include/linux/netfilter/xt_SECMARK.h | 12
-rw-r--r--  include/linux/nfs_fs.h | 2
-rw-r--r--  include/linux/notifier.h | 10
-rw-r--r--  include/linux/oprofile.h | 7
-rw-r--r--  include/linux/pci_ids.h | 1
-rw-r--r--  include/linux/percpu-defs.h | 9
-rw-r--r--  include/linux/percpu.h | 9
-rw-r--r--  include/linux/perf_event.h | 212
-rw-r--r--  include/linux/radix-tree.h | 4
-rw-r--r--  include/linux/rculist.h | 62
-rw-r--r--  include/linux/rculist_nulls.h | 16
-rw-r--r--  include/linux/rcupdate.h | 490
-rw-r--r--  include/linux/rcutiny.h | 104
-rw-r--r--  include/linux/rcutree.h | 57
-rw-r--r--  include/linux/sched.h | 48
-rw-r--r--  include/linux/security.h | 45
-rw-r--r--  include/linux/selinux.h | 63
-rw-r--r--  include/linux/srcu.h | 34
-rw-r--r--  include/linux/stop_machine.h | 10
-rw-r--r--  include/linux/sunrpc/auth_gss.h | 4
-rw-r--r--  include/linux/thread_info.h | 4
-rw-r--r--  include/linux/topology.h | 6
-rw-r--r--  include/linux/tracepoint.h | 5
-rw-r--r--  include/net/cls_cgroup.h | 3
-rw-r--r--  include/net/netfilter/nf_conntrack.h | 2
-rw-r--r--  include/trace/events/irq.h | 26
-rw-r--r--  include/trace/events/napi.h | 25
-rw-r--r--  include/trace/events/net.h | 82
-rw-r--r--  include/trace/events/power.h | 90
-rw-r--r--  include/trace/events/sched.h | 29
-rw-r--r--  include/trace/events/skb.h | 17
83 files changed, 4513 insertions, 542 deletions
diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h
index 62f59080e5cc..04d0a977cd43 100644
--- a/include/asm-generic/hardirq.h
+++ b/include/asm-generic/hardirq.h
@@ -3,13 +3,13 @@
 
 #include <linux/cache.h>
 #include <linux/threads.h>
-#include <linux/irq.h>
 
 typedef struct {
 	unsigned int __softirq_pending;
 } ____cacheline_aligned irq_cpustat_t;
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
+#include <linux/irq.h>
 
 #ifndef ack_bad_irq
 static inline void ack_bad_irq(unsigned int irq)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2bd73e8f9c0..f4d4120e5128 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
 #endif
 
+#ifndef flush_tlb_fix_spurious_fault
+#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#endif
+
 #ifndef pgprot_noncached
 #define pgprot_noncached(prot)	(prot)
 #endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a92a170fb7d..f4229fb315e1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -220,6 +220,8 @@
 	\
 	BUG_TABLE							\
 	\
+	JUMP_TABLE							\
+	\
 	/* PCI quirks */						\
 	.pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {	\
 		VMLINUX_SYMBOL(__start_pci_fixups_early) = .;		\
@@ -563,6 +565,14 @@
 #define BUG_TABLE
 #endif
 
+#define JUMP_TABLE							\
+	. = ALIGN(8);							\
+	__jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__start___jump_table) = .;		\
+		*(__jump_table)						\
+		VMLINUX_SYMBOL(__stop___jump_table) = .;		\
+	}
+
 #ifdef CONFIG_PM_TRACE
 #define TRACEDATA							\
 	. = ALIGN(4);							\
@@ -677,7 +687,9 @@
 		- LOAD_OFFSET) {					\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
 		*(.data..percpu..first)					\
+		. = ALIGN(PAGE_SIZE);					\
 		*(.data..percpu..page_aligned)				\
+		*(.data..percpu..readmostly)				\
 		*(.data..percpu)					\
 		*(.data..percpu..shared_aligned)			\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
@@ -703,7 +715,9 @@
 		VMLINUX_SYMBOL(__per_cpu_load) = .;			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
 		*(.data..percpu..first)					\
+		. = ALIGN(PAGE_SIZE);					\
 		*(.data..percpu..page_aligned)				\
+		*(.data..percpu..readmostly)				\
 		*(.data..percpu)					\
 		*(.data..percpu..shared_aligned)			\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
index 7e3d2859be50..1d0ef1ae8036 100644
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -25,8 +25,6 @@ static inline u32 acpi_pm_read_early(void)
 	return acpi_pm_read_verified() & ACPI_PM_MASK;
 }
 
-extern void pmtimer_wait(unsigned);
-
 #else
 
 static inline u32 acpi_pm_read_early(void)
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
new file mode 100644
index 000000000000..7fff521d7eb5
--- /dev/null
+++ b/include/linux/ceph/auth.h
@@ -0,0 +1,92 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include <linux/ceph/types.h>
5#include <linux/ceph/buffer.h>
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
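
The ops table above is the whole contract between libceph and an auth backend. As a rough sketch of how a backend might wire itself up (the noauth_* names are hypothetical and only a few hooks are filled in; the real implementations are the in-tree "none" and "cephx" protocols):

	#include <linux/ceph/auth.h>

	static int noauth_is_authenticated(struct ceph_auth_client *ac)
	{
		return 1;			/* nothing to negotiate */
	}

	static int noauth_build_request(struct ceph_auth_client *ac,
					void *buf, void *end)
	{
		return 0;			/* no extra payload in the mon handshake */
	}

	static int noauth_handle_reply(struct ceph_auth_client *ac, int result,
				       void *buf, void *end)
	{
		return result;			/* never ask for another round trip */
	}

	static const struct ceph_auth_client_ops noauth_ops = {
		.name			= "noauth",
		.is_authenticated	= noauth_is_authenticated,
		.build_request		= noauth_build_request,
		.handle_reply		= noauth_handle_reply,
	};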
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/include/linux/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
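
A short usage sketch of the API above (illustrative only; the size and GFP flags are arbitrary). Buffers larger than a page come from vmalloc, but callers only ever see the kvec:

	#include <linux/ceph/buffer.h>

	static int buffer_example(void)
	{
		struct ceph_buffer *b = ceph_buffer_new(4 * PAGE_SIZE, GFP_NOFS);

		if (!b)
			return -ENOMEM;
		memset(b->vec.iov_base, 0, b->vec.iov_len);	/* fill via the kvec */
		ceph_buffer_get(b);				/* take a second reference */
		ceph_buffer_put(b);
		ceph_buffer_put(b);				/* last put calls ceph_buffer_release() */
		return 0;
	}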
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
new file mode 100644
index 000000000000..aa2e19182d99
--- /dev/null
+++ b/include/linux/ceph/ceph_debug.h
@@ -0,0 +1,38 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug("%.*s %12.12s:%-4d : " fmt, \
18 8 - (int)sizeof(KBUILD_MODNAME), " ", \
19 ceph_file_part(__FILE__, sizeof(__FILE__)), \
20 __LINE__, ##__VA_ARGS__)
21# else
22/* faux printk call just to see any compiler warnings. */
23# define dout(fmt, ...) do { \
24 if (0) \
25 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
26 } while (0)
27# endif
28
29#else
30
31/*
32 * or, just wrap pr_debug
33 */
34# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
35
36#endif
37
38#endif
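
Typical usage is simply to make this header the first include in a .c file so the pr_fmt() above takes effect, then call dout(); a minimal, illustrative example:

	#include <linux/ceph/ceph_debug.h>	/* must come before other includes for pr_fmt() */
	#include <linux/types.h>

	static void debug_example(u64 off, u64 len)
	{
		/* compiled away unless DEBUG or CONFIG_DYNAMIC_DEBUG enables pr_debug() */
		dout("write %llu~%llu\n", off, len);
	}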
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
new file mode 100644
index 000000000000..5babb8e95352
--- /dev/null
+++ b/include/linux/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef FS_CEPH_FRAG_H
2#define FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
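
To make the 8-bit/24-bit split concrete, here is a worked example using the helpers above; the hex values are computed by hand from the definitions, so treat them as illustrative:

	#include <linux/kernel.h>
	#include <linux/ceph/ceph_frag.h>

	static void frag_example(void)
	{
		__u32 root  = ceph_frag_make(0, 0);		/* 0x00000000: the whole space */
		__u32 left  = ceph_frag_left_child(root);	/* 0x01000000: bits=1, value=0 */
		__u32 right = ceph_frag_right_child(root);	/* 0x01800000: bits=1, value=0x800000 */

		/* 0x123456 has its top value bit clear, so it falls in the left half */
		WARN_ON(!ceph_frag_contains_value(left, 0x123456));
		WARN_ON(ceph_frag_contains_value(right, 0x123456));
	}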
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
new file mode 100644
index 000000000000..c3c74aef289d
--- /dev/null
+++ b/include/linux/ceph/ceph_fs.h
@@ -0,0 +1,729 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef CEPH_FS_H
13#define CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * subprotocol versions. when specific messages types or high-level
20 * protocols change, bump the affected components. we keep rev
21 * internal cluster protocols separately from the public,
22 * client-facing protocol.
23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */
30
31
32#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
34
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31
37
38
39/*
40 * feature bits
41 */
42#define CEPH_FEATURE_UID (1<<0)
43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
46
47
48/*
49 * ceph_file_layout - describe data layout for a file/inode
50 */
51struct ceph_file_layout {
52 /* file -> object mapping */
53 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
54 of page size. */
55 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
59
60 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
62
63 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
65 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
66} __attribute__ ((packed));
67
68#define CEPH_MIN_STRIPE_UNIT 65536
69
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71
72
73/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0
75#define CEPH_CRYPTO_AES 0x1
76
77#define CEPH_AES_IV "cephsageyudagreg"
78
79/* security/authentication protocols */
80#define CEPH_AUTH_UNKNOWN 0x0
81#define CEPH_AUTH_NONE 0x1
82#define CEPH_AUTH_CEPHX 0x2
83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
86
87/*********************************************
88 * message layer
89 */
90
91/*
92 * message types
93 */
94
95/* misc */
96#define CEPH_MSG_SHUTDOWN 1
97#define CEPH_MSG_PING 2
98
99/* client <-> monitor */
100#define CEPH_MSG_MON_MAP 4
101#define CEPH_MSG_MON_GET_MAP 5
102#define CEPH_MSG_STATFS 13
103#define CEPH_MSG_STATFS_REPLY 14
104#define CEPH_MSG_MON_SUBSCRIBE 15
105#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
106#define CEPH_MSG_AUTH 17
107#define CEPH_MSG_AUTH_REPLY 18
108
109/* client <-> mds */
110#define CEPH_MSG_MDS_MAP 21
111
112#define CEPH_MSG_CLIENT_SESSION 22
113#define CEPH_MSG_CLIENT_RECONNECT 23
114
115#define CEPH_MSG_CLIENT_REQUEST 24
116#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
117#define CEPH_MSG_CLIENT_REPLY 26
118#define CEPH_MSG_CLIENT_CAPS 0x310
119#define CEPH_MSG_CLIENT_LEASE 0x311
120#define CEPH_MSG_CLIENT_SNAP 0x312
121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
128/* osd */
129#define CEPH_MSG_OSD_MAP 41
130#define CEPH_MSG_OSD_OP 42
131#define CEPH_MSG_OSD_OPREPLY 43
132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
144struct ceph_mon_request_header {
145 __le64 have_version;
146 __le16 session_mon;
147 __le64 session_mon_tid;
148} __attribute__ ((packed));
149
150struct ceph_mon_statfs {
151 struct ceph_mon_request_header monhdr;
152 struct ceph_fsid fsid;
153} __attribute__ ((packed));
154
155struct ceph_statfs {
156 __le64 kb, kb_used, kb_avail;
157 __le64 num_objects;
158} __attribute__ ((packed));
159
160struct ceph_mon_statfs_reply {
161 struct ceph_fsid fsid;
162 __le64 version;
163 struct ceph_statfs st;
164} __attribute__ ((packed));
165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
191struct ceph_osd_getmap {
192 struct ceph_mon_request_header monhdr;
193 struct ceph_fsid fsid;
194 __le32 start;
195} __attribute__ ((packed));
196
197struct ceph_mds_getmap {
198 struct ceph_mon_request_header monhdr;
199 struct ceph_fsid fsid;
200} __attribute__ ((packed));
201
202struct ceph_client_mount {
203 struct ceph_mon_request_header monhdr;
204} __attribute__ ((packed));
205
206struct ceph_mon_subscribe_item {
207 __le64 have_version; __le64 have;
208 __u8 onetime;
209} __attribute__ ((packed));
210
211struct ceph_mon_subscribe_ack {
212 __le32 duration; /* seconds */
213 struct ceph_fsid fsid;
214} __attribute__ ((packed));
215
216/*
217 * mds states
218 * > 0 -> in
219 * <= 0 -> out
220 */
221#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
222#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
223 empty log. */
224#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
225#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
226#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
227#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
228#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
229
230#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
231#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
232 operations (import, rename, etc.) */
233#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
234#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
235#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
236#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
237#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
238
239extern const char *ceph_mds_state_name(int s);
240
241
242/*
243 * metadata lock types.
244 * - these are bitmasks.. we can compose them
245 * - they also define the lock ordering by the MDS
246 * - a few of these are internal to the mds
247 */
248#define CEPH_LOCK_DVERSION 1
249#define CEPH_LOCK_DN 2
250#define CEPH_LOCK_ISNAP 16
251#define CEPH_LOCK_IVERSION 32 /* mds internal */
252#define CEPH_LOCK_IFILE 64
253#define CEPH_LOCK_IAUTH 128
254#define CEPH_LOCK_ILINK 256
255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
256#define CEPH_LOCK_INEST 1024 /* mds internal */
257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
260
261/* client_session ops */
262enum {
263 CEPH_SESSION_REQUEST_OPEN,
264 CEPH_SESSION_OPEN,
265 CEPH_SESSION_REQUEST_CLOSE,
266 CEPH_SESSION_CLOSE,
267 CEPH_SESSION_REQUEST_RENEWCAPS,
268 CEPH_SESSION_RENEWCAPS,
269 CEPH_SESSION_STALE,
270 CEPH_SESSION_RECALL_STATE,
271};
272
273extern const char *ceph_session_op_name(int op);
274
275struct ceph_mds_session_head {
276 __le32 op;
277 __le64 seq;
278 struct ceph_timespec stamp;
279 __le32 max_caps, max_leases;
280} __attribute__ ((packed));
281
282/* client_request */
283/*
284 * metadata ops.
285 * & 0x001000 -> write op
286 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
287 * & 0x100000 -> use weird ino/path trace
288 */
289#define CEPH_MDS_OP_WRITE 0x001000
290enum {
291 CEPH_MDS_OP_LOOKUP = 0x00100,
292 CEPH_MDS_OP_GETATTR = 0x00101,
293 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
294 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
295
296 CEPH_MDS_OP_SETXATTR = 0x01105,
297 CEPH_MDS_OP_RMXATTR = 0x01106,
298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302 CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
303
304 CEPH_MDS_OP_MKNOD = 0x01201,
305 CEPH_MDS_OP_LINK = 0x01202,
306 CEPH_MDS_OP_UNLINK = 0x01203,
307 CEPH_MDS_OP_RENAME = 0x01204,
308 CEPH_MDS_OP_MKDIR = 0x01220,
309 CEPH_MDS_OP_RMDIR = 0x01221,
310 CEPH_MDS_OP_SYMLINK = 0x01222,
311
312 CEPH_MDS_OP_CREATE = 0x01301,
313 CEPH_MDS_OP_OPEN = 0x00302,
314 CEPH_MDS_OP_READDIR = 0x00305,
315
316 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
317 CEPH_MDS_OP_MKSNAP = 0x01400,
318 CEPH_MDS_OP_RMSNAP = 0x01401,
319 CEPH_MDS_OP_LSSNAP = 0x00402,
320};
321
322extern const char *ceph_mds_op_name(int op);
323
324
325#define CEPH_SETATTR_MODE 1
326#define CEPH_SETATTR_UID 2
327#define CEPH_SETATTR_GID 4
328#define CEPH_SETATTR_MTIME 8
329#define CEPH_SETATTR_ATIME 16
330#define CEPH_SETATTR_SIZE 32
331#define CEPH_SETATTR_CTIME 64
332
333union ceph_mds_request_args {
334 struct {
335 __le32 mask; /* CEPH_CAP_* */
336 } __attribute__ ((packed)) getattr;
337 struct {
338 __le32 mode;
339 __le32 uid;
340 __le32 gid;
341 struct ceph_timespec mtime;
342 struct ceph_timespec atime;
343 __le64 size, old_size; /* old_size needed by truncate */
344 __le32 mask; /* CEPH_SETATTR_* */
345 } __attribute__ ((packed)) setattr;
346 struct {
347 __le32 frag; /* which dir fragment */
348 __le32 max_entries; /* how many dentries to grab */
349 __le32 max_bytes;
350 } __attribute__ ((packed)) readdir;
351 struct {
352 __le32 mode;
353 __le32 rdev;
354 } __attribute__ ((packed)) mknod;
355 struct {
356 __le32 mode;
357 } __attribute__ ((packed)) mkdir;
358 struct {
359 __le32 flags;
360 __le32 mode;
361 __le32 stripe_unit; /* layout for newly created file */
362 __le32 stripe_count; /* ... */
363 __le32 object_size;
364 __le32 file_replication;
365 __le32 preferred;
366 } __attribute__ ((packed)) open;
367 struct {
368 __le32 flags;
369 } __attribute__ ((packed)) setxattr;
370 struct {
371 struct ceph_file_layout layout;
372 } __attribute__ ((packed)) setlayout;
373 struct {
374 __u8 rule; /* currently fcntl or flock */
375 __u8 type; /* shared, exclusive, remove*/
376 __le64 pid; /* process id requesting the lock */
377 __le64 pid_namespace;
378 __le64 start; /* initial location to lock */
379 __le64 length; /* num bytes to lock from start */
380 __u8 wait; /* will caller wait for lock to become available? */
381 } __attribute__ ((packed)) filelock_change;
382} __attribute__ ((packed));
383
384#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
385#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
386
387struct ceph_mds_request_head {
388 __le64 oldest_client_tid;
389 __le32 mdsmap_epoch; /* on client */
390 __le32 flags; /* CEPH_MDS_FLAG_* */
391 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
392 __le16 num_releases; /* # include cap/lease release records */
393 __le32 op; /* mds op code */
394 __le32 caller_uid, caller_gid;
395 __le64 ino; /* use this ino for openc, mkdir, mknod,
396 etc. (if replaying) */
397 union ceph_mds_request_args args;
398} __attribute__ ((packed));
399
400/* cap/lease release record */
401struct ceph_mds_request_release {
402 __le64 ino, cap_id; /* ino and unique cap id */
403 __le32 caps, wanted; /* new issued, wanted */
404 __le32 seq, issue_seq, mseq;
405 __le32 dname_seq; /* if releasing a dentry lease, a */
406 __le32 dname_len; /* string follows. */
407} __attribute__ ((packed));
408
409/* client reply */
410struct ceph_mds_reply_head {
411 __le32 op;
412 __le32 result;
413 __le32 mdsmap_epoch;
414 __u8 safe; /* true if committed to disk */
415 __u8 is_dentry, is_target; /* true if dentry, target inode records
416 are included with reply */
417} __attribute__ ((packed));
418
419/* one for each node split */
420struct ceph_frag_tree_split {
421 __le32 frag; /* this frag splits... */
422 __le32 by; /* ...by this many bits */
423} __attribute__ ((packed));
424
425struct ceph_frag_tree_head {
426 __le32 nsplits; /* num ceph_frag_tree_split records */
427 struct ceph_frag_tree_split splits[];
428} __attribute__ ((packed));
429
430/* capability issue, for bundling with mds reply */
431struct ceph_mds_reply_cap {
432 __le32 caps, wanted; /* caps issued, wanted */
433 __le64 cap_id;
434 __le32 seq, mseq;
435 __le64 realm; /* snap realm */
436 __u8 flags; /* CEPH_CAP_FLAG_* */
437} __attribute__ ((packed));
438
439#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
440
441/* inode record, for bundling with mds reply */
442struct ceph_mds_reply_inode {
443 __le64 ino;
444 __le64 snapid;
445 __le32 rdev;
446 __le64 version; /* inode version */
447 __le64 xattr_version; /* version for xattr blob */
448 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
449 struct ceph_file_layout layout;
450 struct ceph_timespec ctime, mtime, atime;
451 __le32 time_warp_seq;
452 __le64 size, max_size, truncate_size;
453 __le32 truncate_seq;
454 __le32 mode, uid, gid;
455 __le32 nlink;
456 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
457 struct ceph_timespec rctime;
458 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
459} __attribute__ ((packed));
460/* followed by frag array, then symlink string, then xattr blob */
461
462/* reply_lease follows dname, and reply_inode */
463struct ceph_mds_reply_lease {
464 __le16 mask; /* lease type(s) */
465 __le32 duration_ms; /* lease duration */
466 __le32 seq;
467} __attribute__ ((packed));
468
469struct ceph_mds_reply_dirfrag {
470 __le32 frag; /* fragment */
471 __le32 auth; /* auth mds, if this is a delegation point */
472 __le32 ndist; /* number of mds' this is replicated on */
473 __le32 dist[];
474} __attribute__ ((packed));
475
476#define CEPH_LOCK_FCNTL 1
477#define CEPH_LOCK_FLOCK 2
478
479#define CEPH_LOCK_SHARED 1
480#define CEPH_LOCK_EXCL 2
481#define CEPH_LOCK_UNLOCK 4
482
483struct ceph_filelock {
484 __le64 start;/* file offset to start lock at */
485 __le64 length; /* num bytes to lock; 0 for all following start */
486 __le64 client; /* which client holds the lock */
487 __le64 pid; /* process id holding the lock on the client */
488 __le64 pid_namespace;
489 __u8 type; /* shared lock, exclusive lock, or unlock */
490} __attribute__ ((packed));
491
492
493/* file access modes */
494#define CEPH_FILE_MODE_PIN 0
495#define CEPH_FILE_MODE_RD 1
496#define CEPH_FILE_MODE_WR 2
497#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
498#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
499#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
500
501int ceph_flags_to_mode(int flags);
502
503
504/* capability bits */
505#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
506
507/* generic cap bits */
508#define CEPH_CAP_GSHARED 1 /* client can read */
509#define CEPH_CAP_GEXCL 2 /* client can read and update */
510#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
511#define CEPH_CAP_GRD 8 /* (file) client can read */
512#define CEPH_CAP_GWR 16 /* (file) client can write */
513#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
514#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
515#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
516
517/* per-lock shift */
518#define CEPH_CAP_SAUTH 2
519#define CEPH_CAP_SLINK 4
520#define CEPH_CAP_SXATTR 6
521#define CEPH_CAP_SFILE 8
522#define CEPH_CAP_SFLOCK 20
523
524#define CEPH_CAP_BITS 22
525
526/* composed values */
527#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
528#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
529#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
530#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
531#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
532#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
533#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
534#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
535#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
536#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
537#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
538#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
539#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
540#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
541#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
542#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
543#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
544
545
546/* cap masks (for getattr) */
547#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
548#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
549#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
550#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
551#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
552#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
553#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
554#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
555#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
556#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
557#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
558#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
559#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
560 CEPH_CAP_AUTH_SHARED | \
561 CEPH_CAP_LINK_SHARED | \
562 CEPH_CAP_FILE_SHARED | \
563 CEPH_CAP_XATTR_SHARED)
564
565#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
566 CEPH_CAP_LINK_SHARED | \
567 CEPH_CAP_XATTR_SHARED | \
568 CEPH_CAP_FILE_SHARED)
569#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
570 CEPH_CAP_FILE_CACHE)
571
572#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
573 CEPH_CAP_LINK_EXCL | \
574 CEPH_CAP_XATTR_EXCL | \
575 CEPH_CAP_FILE_EXCL)
576#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
577 CEPH_CAP_FILE_EXCL)
578#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
579#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
580 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
581 CEPH_CAP_PIN)
582
583#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
584 CEPH_LOCK_IXATTR)
585
586int ceph_caps_for_mode(int mode);
587
588enum {
589 CEPH_CAP_OP_GRANT, /* mds->client grant */
590 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
591 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
592 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
593 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
594 CEPH_CAP_OP_UPDATE, /* client->mds update */
595 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
596 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
597 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
598 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
599 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
600 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
601 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
602};
603
604extern const char *ceph_cap_op_name(int op);
605
606/*
607 * caps message, used for capability callbacks, acks, requests, etc.
608 */
609struct ceph_mds_caps {
610 __le32 op; /* CEPH_CAP_OP_* */
611 __le64 ino, realm;
612 __le64 cap_id;
613 __le32 seq, issue_seq;
614 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
615 __le32 migrate_seq;
616 __le64 snap_follows;
617 __le32 snap_trace_len;
618
619 /* authlock */
620 __le32 uid, gid, mode;
621
622 /* linklock */
623 __le32 nlink;
624
625 /* xattrlock */
626 __le32 xattr_len;
627 __le64 xattr_version;
628
629 /* filelock */
630 __le64 size, max_size, truncate_size;
631 __le32 truncate_seq;
632 struct ceph_timespec mtime, atime, ctime;
633 struct ceph_file_layout layout;
634 __le32 time_warp_seq;
635} __attribute__ ((packed));
636
637/* cap release msg head */
638struct ceph_mds_cap_release {
639 __le32 num; /* number of cap_items that follow */
640} __attribute__ ((packed));
641
642struct ceph_mds_cap_item {
643 __le64 ino;
644 __le64 cap_id;
645 __le32 migrate_seq, seq;
646} __attribute__ ((packed));
647
648#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
649#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
650#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
651#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
652
653extern const char *ceph_lease_op_name(int o);
654
655/* lease msg header */
656struct ceph_mds_lease {
657 __u8 action; /* CEPH_MDS_LEASE_* */
658 __le16 mask; /* which lease */
659 __le64 ino;
660 __le64 first, last; /* snap range */
661 __le32 seq;
662 __le32 duration_ms; /* duration of renewal */
663} __attribute__ ((packed));
664/* followed by a __le32+string for dname */
665
666/* client reconnect */
667struct ceph_mds_cap_reconnect {
668 __le64 cap_id;
669 __le32 wanted;
670 __le32 issued;
671 __le64 snaprealm;
672 __le64 pathbase; /* base ino for our path to this ino */
673 __le32 flock_len; /* size of flock state blob, if any */
674} __attribute__ ((packed));
675/* followed by flock blob */
676
677struct ceph_mds_cap_reconnect_v1 {
678 __le64 cap_id;
679 __le32 wanted;
680 __le32 issued;
681 __le64 size;
682 struct ceph_timespec mtime, atime;
683 __le64 snaprealm;
684 __le64 pathbase; /* base ino for our path to this ino */
685} __attribute__ ((packed));
686
687struct ceph_mds_snaprealm_reconnect {
688 __le64 ino; /* snap realm base */
689 __le64 seq; /* snap seq for this snap realm */
690 __le64 parent; /* parent realm */
691} __attribute__ ((packed));
692
693/*
694 * snaps
695 */
696enum {
697 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
698 CEPH_SNAP_OP_CREATE,
699 CEPH_SNAP_OP_DESTROY,
700 CEPH_SNAP_OP_SPLIT,
701};
702
703extern const char *ceph_snap_op_name(int o);
704
705/* snap msg header */
706struct ceph_mds_snap_head {
707 __le32 op; /* CEPH_SNAP_OP_* */
708 __le64 split; /* ino to split off, if any */
709 __le32 num_split_inos; /* # inos belonging to new child realm */
710 __le32 num_split_realms; /* # child realms under new child realm */
711 __le32 trace_len; /* size of snap trace blob */
712} __attribute__ ((packed));
713/* followed by split ino list, then split realms, then the trace blob */
714
715/*
716 * encode info about a snaprealm, as viewed by a client
717 */
718struct ceph_mds_snap_realm {
719 __le64 ino; /* ino */
720 __le64 created; /* snap: when created */
721 __le64 parent; /* ino: parent realm */
722 __le64 parent_since; /* snap: same parent since */
723 __le64 seq; /* snap: version */
724 __le32 num_snaps;
725 __le32 num_prior_parent_snaps;
726} __attribute__ ((packed));
727/* followed by my snap list, then prior parent snap list */
728
729#endif
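
The capability values are just the generic G* bits shifted by the per-lock S* amounts; a few spot checks of the arithmetic (worked out from the defines above, shown here only for illustration):

	#include <linux/kernel.h>
	#include <linux/ceph/ceph_fs.h>

	static void cap_bits_example(void)
	{
		BUILD_BUG_ON(CEPH_CAP_AUTH_SHARED != 0x000004);	/*  1 << 2  */
		BUILD_BUG_ON(CEPH_CAP_LINK_EXCL   != 0x000020);	/*  2 << 4  */
		BUILD_BUG_ON(CEPH_CAP_FILE_RD     != 0x000800);	/*  8 << 8  */
		BUILD_BUG_ON(CEPH_CAP_FILE_BUFFER != 0x002000);	/* 32 << 8  */
		BUILD_BUG_ON(CEPH_CAP_FLOCK_EXCL  != 0x200000);	/*  2 << 20 */
	}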
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
new file mode 100644
index 000000000000..d099c3f90236
--- /dev/null
+++ b/include/linux/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef FS_CEPH_HASH_H
2#define FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000000..2a79702e092b
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
1#ifndef _FS_CEPH_DEBUGFS_H
2#define _FS_CEPH_DEBUGFS_H
3
4#include "ceph_debug.h"
5#include "types.h"
6
7#define CEPH_DEFINE_SHOW_FUNC(name) \
8static int name##_open(struct inode *inode, struct file *file) \
9{ \
10 struct seq_file *sf; \
11 int ret; \
12 \
13 ret = single_open(file, name, NULL); \
14 sf = file->private_data; \
15 sf->private = inode->i_private; \
16 return ret; \
17} \
18 \
19static const struct file_operations name##_fops = { \
20 .open = name##_open, \
21 .read = seq_read, \
22 .llseek = seq_lseek, \
23 .release = single_release, \
24};
25
26/* debugfs.c */
27extern int ceph_debugfs_init(void);
28extern void ceph_debugfs_cleanup(void);
29extern int ceph_debugfs_client_init(struct ceph_client *client);
30extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
31
32#endif
33
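
The CEPH_DEFINE_SHOW_FUNC() macro above pairs a seq_file show function with generated _open/_fops symbols; a hedged sketch of the intended pattern (the monc_show name and the "monc" debugfs file are made up for this example, and it assumes CONFIG_DEBUG_FS):

	#include <linux/debugfs.h>
	#include <linux/seq_file.h>
	#include <linux/ceph/libceph.h>
	#include <linux/ceph/debugfs.h>

	static int monc_show(struct seq_file *s, void *p)
	{
		struct ceph_client *client = s->private;	/* stashed by the generated open() */

		seq_printf(s, "have_fsid %d\n", client->have_fsid);
		return 0;
	}

	CEPH_DEFINE_SHOW_FUNC(monc_show)	/* emits monc_show_open() and monc_show_fops */

	static void register_example(struct ceph_client *client)
	{
		debugfs_create_file("monc", 0400, client->debugfs_dir,
				    client, &monc_show_fops);
	}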
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
new file mode 100644
index 000000000000..c5b6939fb32a
--- /dev/null
+++ b/include/linux/ceph/decode.h
@@ -0,0 +1,201 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
104}
105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
106{
107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
109 WARN_ON(a->in_addr.ss_family == 512);
110}
111
112/*
113 * encoders
114 */
115static inline void ceph_encode_64(void **p, u64 v)
116{
117 put_unaligned_le64(v, (__le64 *)*p);
118 *p += sizeof(u64);
119}
120static inline void ceph_encode_32(void **p, u32 v)
121{
122 put_unaligned_le32(v, (__le32 *)*p);
123 *p += sizeof(u32);
124}
125static inline void ceph_encode_16(void **p, u16 v)
126{
127 put_unaligned_le16(v, (__le16 *)*p);
128 *p += sizeof(u16);
129}
130static inline void ceph_encode_8(void **p, u8 v)
131{
132 *(u8 *)*p = v;
133 (*p)++;
134}
135static inline void ceph_encode_copy(void **p, const void *s, int len)
136{
137 memcpy(*p, s, len);
138 *p += len;
139}
140
141/*
142 * filepath, string encoders
143 */
144static inline void ceph_encode_filepath(void **p, void *end,
145 u64 ino, const char *path)
146{
147 u32 len = path ? strlen(path) : 0;
148 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
149 ceph_encode_8(p, 1);
150 ceph_encode_64(p, ino);
151 ceph_encode_32(p, len);
152 if (len)
153 memcpy(*p, path, len);
154 *p += len;
155}
156
157static inline void ceph_encode_string(void **p, void *end,
158 const char *s, u32 len)
159{
160 BUG_ON(*p + sizeof(len) + len > end);
161 ceph_encode_32(p, len);
162 if (len)
163 memcpy(*p, s, len);
164 *p += len;
165}
166
167#define ceph_encode_need(p, end, n, bad) \
168 do { \
169 if (unlikely(*(p) + (n) > (end))) \
170 goto bad; \
171 } while (0)
172
173#define ceph_encode_64_safe(p, end, v, bad) \
174 do { \
175 ceph_encode_need(p, end, sizeof(u64), bad); \
176 ceph_encode_64(p, v); \
177 } while (0)
178#define ceph_encode_32_safe(p, end, v, bad) \
179 do { \
180 ceph_encode_need(p, end, sizeof(u32), bad); \
181 ceph_encode_32(p, v); \
182 } while (0)
183#define ceph_encode_16_safe(p, end, v, bad) \
184 do { \
185 ceph_encode_need(p, end, sizeof(u16), bad); \
186 ceph_encode_16(p, v); \
187 } while (0)
188
189#define ceph_encode_copy_safe(p, end, pv, n, bad) \
190 do { \
191 ceph_encode_need(p, end, n, bad); \
192 ceph_encode_copy(p, pv, n); \
193 } while (0)
194#define ceph_encode_string_safe(p, end, s, n, bad) \
195 do { \
196 ceph_encode_need(p, end, n, bad); \
197 ceph_encode_string(p, end, s, n); \
198 } while (0)
199
200
201#endif
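
The *_safe macros above are meant to be used with a local error label, so every read is bounds-checked against end before *p advances. A sketch of the pattern (the record layout decoded here is invented):

	#include <linux/errno.h>
	#include <linux/ceph/decode.h>

	static int decode_example(void **p, void *end, u64 *ino,
				  char *name, size_t name_max)
	{
		u32 len;

		ceph_decode_64_safe(p, end, *ino, bad);		/* 8-byte ino */
		ceph_decode_32_safe(p, end, len, bad);		/* length-prefixed string */
		if (len >= name_max)
			goto bad;
		ceph_decode_copy_safe(p, end, name, len, bad);
		name[len] = '\0';
		return 0;

	bad:
		return -ERANGE;
	}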
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 000000000000..f22b2e941686
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
1#ifndef _FS_CEPH_LIBCEPH_H
2#define _FS_CEPH_LIBCEPH_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/*
25 * Supported features
26 */
27#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
28#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
29
30/*
31 * mount options
32 */
33#define CEPH_OPT_FSID (1<<0)
34#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
35#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
36#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
37
38#define CEPH_OPT_DEFAULT (0);
39
40#define ceph_set_opt(client, opt) \
41 (client)->options->flags |= CEPH_OPT_##opt;
42#define ceph_test_opt(client, opt) \
43 (!!((client)->options->flags & CEPH_OPT_##opt))
44
45struct ceph_options {
46 int flags;
47 struct ceph_fsid fsid;
48 struct ceph_entity_addr my_addr;
49 int mount_timeout;
50 int osd_idle_ttl;
51 int osd_timeout;
52 int osd_keepalive_timeout;
53
54 /*
55 * any type that can't be simply compared or doesn't need
56 * to be compared should go beyond this point,
57 * ceph_compare_options() should be updated accordingly
58 */
59
60 struct ceph_entity_addr *mon_addr; /* should be the first
61 pointer type of args */
62 int num_mon;
63 char *name;
64 char *secret;
65};
66
67/*
68 * defaults
69 */
70#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
71#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
72#define CEPH_OSD_KEEPALIVE_DEFAULT 5
73#define CEPH_OSD_IDLE_TTL_DEFAULT 60
74#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
75
76#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
77#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
78
79#define CEPH_AUTH_NAME_DEFAULT "guest"
80
81/*
82 * Delay telling the MDS we no longer want caps, in case we reopen
83 * the file. Delay a minimum amount of time, even if we send a cap
84 * message for some other reason. Otherwise, take the opportunity to
85 * update the mds to avoid sending another message later.
86 */
87#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
88#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
89
90#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
91
92/* mount state */
93enum {
94 CEPH_MOUNT_MOUNTING,
95 CEPH_MOUNT_MOUNTED,
96 CEPH_MOUNT_UNMOUNTING,
97 CEPH_MOUNT_UNMOUNTED,
98 CEPH_MOUNT_SHUTDOWN,
99};
100
101/*
102 * subtract jiffies
103 */
104static inline unsigned long time_sub(unsigned long a, unsigned long b)
105{
106 BUG_ON(time_after(b, a));
107 return (long)a - (long)b;
108}
109
110struct ceph_mds_client;
111
112/*
113 * per client state
114 *
115 * possibly shared by multiple mount points, if they are
116 * mounting the same ceph filesystem/cluster.
117 */
118struct ceph_client {
119 struct ceph_fsid fsid;
120 bool have_fsid;
121
122 void *private;
123
124 struct ceph_options *options;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 wait_queue_head_t auth_wq;
128 int auth_err;
129
130 int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
131
132 u32 supported_features;
133 u32 required_features;
134
135 struct ceph_messenger *msgr; /* messenger instance */
136 struct ceph_mon_client monc;
137 struct ceph_osd_client osdc;
138
139#ifdef CONFIG_DEBUG_FS
140 struct dentry *debugfs_dir;
141 struct dentry *debugfs_monmap;
142 struct dentry *debugfs_osdmap;
143#endif
144};
145
146
147
148/*
149 * snapshots
150 */
151
152/*
153 * A "snap context" is the set of existing snapshots when we
154 * write data. It is used by the OSD to guide its COW behavior.
155 *
156 * The ceph_snap_context is refcounted, and attached to each dirty
157 * page, indicating which context the dirty data belonged to when it was
158 * dirtied.
159 */
160struct ceph_snap_context {
161 atomic_t nref;
162 u64 seq;
163 int num_snaps;
164 u64 snaps[];
165};
166
167static inline struct ceph_snap_context *
168ceph_get_snap_context(struct ceph_snap_context *sc)
169{
170 /*
171 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
172 atomic_read(&sc->nref)+1);
173 */
174 if (sc)
175 atomic_inc(&sc->nref);
176 return sc;
177}
178
179static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
180{
181 if (!sc)
182 return;
183 /*
184 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
185 atomic_read(&sc->nref)-1);
186 */
187 if (atomic_dec_and_test(&sc->nref)) {
188 /*printk(" deleting snap_context %p\n", sc);*/
189 kfree(sc);
190 }
191}
192
193/*
194 * calculate the number of pages a given length and offset map onto,
195 * if we align the data.
196 */
197static inline int calc_pages_for(u64 off, u64 len)
198{
199 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
200 (off >> PAGE_CACHE_SHIFT);
201}
202
203/* ceph_common.c */
204extern const char *ceph_msg_type_name(int type);
205extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
206extern struct kmem_cache *ceph_inode_cachep;
207extern struct kmem_cache *ceph_cap_cachep;
208extern struct kmem_cache *ceph_dentry_cachep;
209extern struct kmem_cache *ceph_file_cachep;
210
211extern int ceph_parse_options(struct ceph_options **popt, char *options,
212 const char *dev_name, const char *dev_name_end,
213 int (*parse_extra_token)(char *c, void *private),
214 void *private);
215extern void ceph_destroy_options(struct ceph_options *opt);
216extern int ceph_compare_options(struct ceph_options *new_opt,
217 struct ceph_client *client);
218extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
219 void *private);
220extern u64 ceph_client_id(struct ceph_client *client);
221extern void ceph_destroy_client(struct ceph_client *client);
222extern int __ceph_open_session(struct ceph_client *client,
223 unsigned long started);
224extern int ceph_open_session(struct ceph_client *client);
225
226/* pagevec.c */
227extern void ceph_release_page_vector(struct page **pages, int num_pages);
228
229extern struct page **ceph_get_direct_page_vector(const char __user *data,
230 int num_pages,
231 loff_t off, size_t len);
232extern void ceph_put_page_vector(struct page **pages, int num_pages);
233extern void ceph_release_page_vector(struct page **pages, int num_pages);
234extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
235extern int ceph_copy_user_to_page_vector(struct page **pages,
236 const char __user *data,
237 loff_t off, size_t len);
238extern int ceph_copy_to_page_vector(struct page **pages,
239 const char *data,
240 loff_t off, size_t len);
241extern int ceph_copy_from_page_vector(struct page **pages,
242 char *data,
243 loff_t off, size_t len);
244extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
245 loff_t off, size_t len);
246extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
247
248
249#endif /* _FS_CEPH_SUPER_H */
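
calc_pages_for() above just rounds the byte range [off, off+len) out to page boundaries and counts the pages it covers; a few worked values, assuming PAGE_CACHE_SIZE is 4096:

	#include <linux/ceph/libceph.h>

	static void pages_for_example(void)
	{
		int a = calc_pages_for(0, 4096);	/* == 1: exactly one page            */
		int b = calc_pages_for(4000, 200);	/* == 2: 200 bytes straddle a border */
		int c = calc_pages_for(4096, 4096);	/* == 1: aligned, second page only   */

		(void)a; (void)b; (void)c;		/* values are for illustration only  */
	}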
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
new file mode 100644
index 000000000000..4c5cb0880bba
--- /dev/null
+++ b/include/linux/ceph/mdsmap.h
@@ -0,0 +1,62 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 bool laggy;
17 u32 *export_targets;
18};
19
20struct ceph_mdsmap {
21 u32 m_epoch, m_client_epoch, m_last_failure;
22 u32 m_root;
23 u32 m_session_timeout; /* seconds */
24 u32 m_session_autoclose; /* seconds */
25 u64 m_max_file_size;
26 u32 m_max_mds; /* size of m_addr, m_state arrays */
27 struct ceph_mds_info *m_info;
28
29 /* which object pools file data can be stored in */
30 int m_num_data_pg_pools;
31 u32 *m_data_pg_pools;
32 u32 m_cas_pg_pool;
33};
34
35static inline struct ceph_entity_addr *
36ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
37{
38 if (w >= m->m_max_mds)
39 return NULL;
40 return &m->m_info[w].addr;
41}
42
43static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
44{
45 BUG_ON(w < 0);
46 if (w >= m->m_max_mds)
47 return CEPH_MDS_STATE_DNE;
48 return m->m_info[w].state;
49}
50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
61
62#endif
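
A small sketch of how the accessors above combine when choosing an MDS to talk to; pick_mds_addr() is hypothetical and the exact state check a real caller wants may differ:

	#include <linux/ceph/mdsmap.h>

	static struct ceph_entity_addr *pick_mds_addr(struct ceph_mdsmap *m, int mds)
	{
		if (ceph_mdsmap_get_state(m, mds) <= 0 ||	/* <= 0 means the rank is "out" */
		    ceph_mdsmap_is_laggy(m, mds))
			return NULL;
		return ceph_mdsmap_get_addr(m, mds);		/* NULL if mds >= m_max_mds */
	}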
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
new file mode 100644
index 000000000000..5956d62c3057
--- /dev/null
+++ b/include/linux/ceph/messenger.h
@@ -0,0 +1,261 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
 43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%d */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections i (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68
69 u32 supported_features;
70 u32 required_features;
71};
72
73/*
74 * a single message. it contains a header (src, dest, message type, etc.),
75 * footer (crc values, mainly), a "front" message body, and possibly a
76 * data payload (stored in some number of pages).
77 */
78struct ceph_msg {
79 struct ceph_msg_header hdr; /* header */
80 struct ceph_msg_footer footer; /* footer */
81 struct kvec front; /* unaligned blobs of message */
82 struct ceph_buffer *middle;
83 struct page **pages; /* data payload. NOT OWNER. */
84 unsigned nr_pages; /* size of page array */
85 struct ceph_pagelist *pagelist; /* instead of pages */
86 struct list_head list_head;
87 struct kref kref;
88 struct bio *bio; /* instead of pages/pagelist */
89 struct bio *bio_iter; /* bio iterator */
90 int bio_seg; /* current bio segment */
91 struct ceph_pagelist *trail; /* the trailing part of the data */
92 bool front_is_vmalloc;
93 bool more_to_follow;
94 bool needs_out_seq;
95 int front_max;
96
97 struct ceph_msgpool *pool;
98};
99
100struct ceph_msg_pos {
101 int page, page_pos; /* which page; offset in page */
102 int data_pos; /* offset in data payload */
103 int did_page_crc; /* true if we've calculated crc for current page */
104};
105
106/* ceph connection fault delay defaults, for exponential backoff */
107#define BASE_DELAY_INTERVAL (HZ/2)
108#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
109
110/*
111 * ceph_connection state bit flags
112 *
113 * QUEUED and BUSY are used together to ensure that only a single
114 * thread is currently opening, reading or writing data to the socket.
115 */
116#define LOSSYTX 0 /* we can close channel or drop messages on errors */
117#define CONNECTING 1
118#define NEGOTIATING 2
119#define KEEPALIVE_PENDING 3
120#define WRITE_PENDING 4 /* we have data ready to send */
121#define QUEUED 5 /* there is work queued on this connection */
122#define BUSY 6 /* work is being done */
123#define STANDBY 8 /* no outgoing messages, socket closed. we keep
124 * the ceph_connection around to maintain shared
125 * state with the peer. */
126#define CLOSED 10 /* we've closed the connection */
127#define SOCK_CLOSED 11 /* socket state changed to closed */
128#define OPENING 13 /* open connection w/ (possibly new) peer */
129#define DEAD 14 /* dead, about to kfree */
130
131/*
132 * A single connection with another host.
133 *
134 * We maintain a queue of outgoing messages, and some session state to
135 * ensure that we can preserve the lossless, ordered delivery of
136 * messages in the case of a TCP disconnect.
137 */
138struct ceph_connection {
139 void *private;
140 atomic_t nref;
141
142 const struct ceph_connection_operations *ops;
143
144 struct ceph_messenger *msgr;
145 struct socket *sock;
146 unsigned long state; /* connection state (see flags above) */
147 const char *error_msg; /* error message, if any */
148
149 struct ceph_entity_addr peer_addr; /* peer address */
150 struct ceph_entity_name peer_name; /* peer name */
151 struct ceph_entity_addr peer_addr_for_me;
152 unsigned peer_features;
153 u32 connect_seq; /* identify the most recent connection
154 attempt for this connection, client */
155 u32 peer_global_seq; /* peer's global seq for this connection */
156
157 int auth_retry; /* true if we need a newer authorizer */
158 void *auth_reply_buf; /* where to put the authorizer reply */
159 int auth_reply_buf_len;
160
161 struct mutex mutex;
162
163 /* out queue */
164 struct list_head out_queue;
165 struct list_head out_sent; /* sending or sent but unacked */
166 u64 out_seq; /* last message queued for send */
167 bool out_keepalive_pending;
168
169 u64 in_seq, in_seq_acked; /* last message received, acked */
170
171 /* connection negotiation temps */
172 char in_banner[CEPH_BANNER_MAX_LEN];
173 union {
174 struct { /* outgoing connection */
175 struct ceph_msg_connect out_connect;
176 struct ceph_msg_connect_reply in_reply;
177 };
178 struct { /* incoming */
179 struct ceph_msg_connect in_connect;
180 struct ceph_msg_connect_reply out_reply;
181 };
182 };
183 struct ceph_entity_addr actual_peer_addr;
184
185 /* message out temps */
186 struct ceph_msg *out_msg; /* sending message (== tail of
187 out_sent) */
188 bool out_msg_done;
189 struct ceph_msg_pos out_msg_pos;
190
191 struct kvec out_kvec[8], /* sending header/footer data */
192 *out_kvec_cur;
193 int out_kvec_left; /* kvec's left in out_kvec */
194 int out_skip; /* skip this many bytes */
195 int out_kvec_bytes; /* total bytes left */
196 bool out_kvec_is_msg; /* kvec refers to out_msg */
197 int out_more; /* there is more data after the kvecs */
198 __le64 out_temp_ack; /* for writing an ack */
199
200 /* message in temps */
201 struct ceph_msg_header in_hdr;
202 struct ceph_msg *in_msg;
203 struct ceph_msg_pos in_msg_pos;
204 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
205
206 char in_tag; /* protocol control byte */
207 int in_base_pos; /* bytes read */
208 __le64 in_temp_ack; /* for reading an ack */
209
210 struct delayed_work work; /* send|recv work */
211 unsigned long delay; /* current delay interval */
212};
213
214
215extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
216extern int ceph_parse_ips(const char *c, const char *end,
217 struct ceph_entity_addr *addr,
218 int max_count, int *count);
219
220
221extern int ceph_msgr_init(void);
222extern void ceph_msgr_exit(void);
223extern void ceph_msgr_flush(void);
224
225extern struct ceph_messenger *ceph_messenger_create(
226 struct ceph_entity_addr *myaddr,
227 u32 features, u32 required);
228extern void ceph_messenger_destroy(struct ceph_messenger *);
229
230extern void ceph_con_init(struct ceph_messenger *msgr,
231 struct ceph_connection *con);
232extern void ceph_con_open(struct ceph_connection *con,
233 struct ceph_entity_addr *addr);
234extern bool ceph_con_opened(struct ceph_connection *con);
235extern void ceph_con_close(struct ceph_connection *con);
236extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
237extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
238extern void ceph_con_revoke_message(struct ceph_connection *con,
239 struct ceph_msg *msg);
240extern void ceph_con_keepalive(struct ceph_connection *con);
241extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
242extern void ceph_con_put(struct ceph_connection *con);
243
244extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
245extern void ceph_msg_kfree(struct ceph_msg *m);
246
247
248static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
249{
250 kref_get(&msg->kref);
251 return msg;
252}
253extern void ceph_msg_last_put(struct kref *kref);
254static inline void ceph_msg_put(struct ceph_msg *msg)
255{
256 kref_put(&msg->kref, ceph_msg_last_put);
257}
258
259extern void ceph_msg_dump(struct ceph_msg *msg);
260
261#endif
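The declarations above imply a create/init/open/send lifecycle for the messenger. A minimal sketch of that flow, assuming a caller-supplied ops table (my_con_ops) and message type (MY_MSG_TYPE), neither of which is defined by this header:

/* Illustrative sketch only; my_con_ops and MY_MSG_TYPE are assumptions. */
static int example_send(struct ceph_messenger *msgr,
			struct ceph_connection *con,
			struct ceph_entity_addr *peer)
{
	struct ceph_msg *msg;

	ceph_con_init(msgr, con);
	con->ops = &my_con_ops;          /* get/put/dispatch/fault callbacks */
	ceph_con_open(con, peer);        /* connect (or reconnect) to the peer */

	msg = ceph_msg_new(MY_MSG_TYPE, 128, GFP_NOFS);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(con, msg);         /* queued; ceph_msgr_wq writes it out */
	return 0;
}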
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
new file mode 100644
index 000000000000..545f85917780
--- /dev/null
+++ b/include/linux/ceph/mon_client.h
@@ -0,0 +1,122 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
 43 * ceph_mon_generic_request is used for the statfs and poolop requests,
 44 * which are done a bit differently because we need to get data back
 45 * to the caller
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 int buf_len;
54 struct completion completion;
55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
57};
58
59struct ceph_mon_client {
60 struct ceph_client *client;
61 struct ceph_monmap *monmap;
62
63 struct mutex mutex;
64 struct delayed_work delayed_work;
65
66 struct ceph_auth_client *auth;
67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
68 int pending_auth;
69
70 bool hunting;
71 int cur_mon; /* last monitor i contacted */
72 unsigned long sub_sent, sub_renew_after;
73 struct ceph_connection *con;
74 bool have_fsid;
75
76 /* pending generic requests */
77 struct rb_root generic_request_tree;
78 int num_generic_requests;
79 u64 last_tid;
80
81 /* mds/osd map */
82 int want_mdsmap;
83 int want_next_osdmap; /* 1 = want, 2 = want+asked */
84 u32 have_osdmap, have_mdsmap;
85
86#ifdef CONFIG_DEBUG_FS
87 struct dentry *debugfs_file;
88#endif
89};
90
91extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
92extern int ceph_monmap_contains(struct ceph_monmap *m,
93 struct ceph_entity_addr *addr);
94
95extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
96extern void ceph_monc_stop(struct ceph_mon_client *monc);
97
98/*
99 * The model here is to indicate that we need a new map of at least
100 * epoch @want, and also call in when we receive a map. We will
101 * periodically rerequest the map from the monitor cluster until we
102 * get what we want.
103 */
104extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
105extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
106
107extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
108
109extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
110 struct ceph_statfs *buf);
111
112extern int ceph_monc_open_session(struct ceph_mon_client *monc);
113
114extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
115
116extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
117 u32 pool, u64 *snapid);
118
119extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
120 u32 pool, u64 snapid);
121
122#endif
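The subscription model described in the comment above ("indicate that we need a new map of at least epoch @want, and also call in when we receive a map") reduces to two calls on the receive path. A minimal sketch, with the "still need a newer map" decision left to the caller:

/* Illustrative sketch of the osdmap subscription model. */
static void example_handle_osdmap(struct ceph_mon_client *monc, u32 epoch,
				  bool need_newer)
{
	ceph_monc_got_osdmap(monc, epoch);            /* record what we now have */
	if (need_newer)
		ceph_monc_request_next_osdmap(monc);  /* keep asking the monitors */
}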
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
new file mode 100644
index 000000000000..a362605f9368
--- /dev/null
+++ b/include/linux/ceph/msgpool.h
@@ -0,0 +1,25 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
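A short usage sketch for the pool API above; the front length, pool size, and pool name are placeholders chosen for illustration:

/* Illustrative only: preallocate reply messages, then take and return one. */
static int example_msgpool(struct ceph_msgpool *pool)
{
	struct ceph_msg *msg;
	int ret;

	ret = ceph_msgpool_init(pool, 4096 /* front_len */, 8 /* size */,
				true /* blocking */, "example");
	if (ret)
		return ret;

	msg = ceph_msgpool_get(pool, 4096);
	/* ... fill in and send msg ... */
	ceph_msgpool_put(pool, msg);
	ceph_msgpool_destroy(pool);
	return 0;
}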
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
new file mode 100644
index 000000000000..680d3d648cac
--- /dev/null
+++ b/include/linux/ceph/msgr.h
@@ -0,0 +1,175 @@
1#ifndef CEPH_MSGR_H
2#define CEPH_MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
 20 * tcp connection banner. include a protocol version, and adjust it
 21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
 30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
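A worked example of the rollover-safe comparison defined above: because both operands are cast to signed 32-bit values before subtracting, a sequence number that has just wrapped still compares newer than one taken before the wrap.

/* ceph_seq_cmp(b, a) with a = 0xfffffffe (just before wrap) and b = 3:
 *   (__s32)3 - (__s32)0xfffffffe  ==  3 - (-2)  ==  5   (> 0, b is newer)
 * and the reverse comparison yields -5 (a is older than b). */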
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
new file mode 100644
index 000000000000..6c91fb032c39
--- /dev/null
+++ b/include/linux/ceph/osd_client.h
@@ -0,0 +1,234 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18struct ceph_pagelist;
19
20/*
21 * completion callback for async writepages
22 */
23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
24 struct ceph_msg *);
25
26/* a given osd we're communicating with */
27struct ceph_osd {
28 atomic_t o_ref;
29 struct ceph_osd_client *o_osdc;
30 int o_osd;
31 int o_incarnation;
32 struct rb_node o_node;
33 struct ceph_connection o_con;
34 struct list_head o_requests;
35 struct list_head o_osd_lru;
36 struct ceph_authorizer *o_authorizer;
37 void *o_authorizer_buf, *o_authorizer_reply_buf;
38 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
39 unsigned long lru_ttl;
40 int o_marked_for_keepalive;
41 struct list_head o_keepalive_item;
42};
43
44/* an in-flight request */
45struct ceph_osd_request {
46 u64 r_tid; /* unique for this client */
47 struct rb_node r_node;
48 struct list_head r_req_lru_item;
49 struct list_head r_osd_item;
50 struct ceph_osd *r_osd;
51 struct ceph_pg r_pgid;
52 int r_pg_osds[CEPH_PG_MAX_SIZE];
53 int r_num_pg_osds;
54
55 struct ceph_connection *r_con_filling_msg;
56
57 struct ceph_msg *r_request, *r_reply;
58 int r_result;
59 int r_flags; /* any additional flags for the osd */
60 u32 r_sent; /* >0 if r_request is sending/sent */
61 int r_got_reply;
62
63 struct ceph_osd_client *r_osdc;
64 struct kref r_kref;
65 bool r_mempool;
66 struct completion r_completion, r_safe_completion;
67 ceph_osdc_callback_t r_callback, r_safe_callback;
68 struct ceph_eversion r_reassert_version;
69 struct list_head r_unsafe_item;
70
71 struct inode *r_inode; /* for use by callbacks */
72 void *r_priv; /* ditto */
73
74 char r_oid[40]; /* object name */
75 int r_oid_len;
76 unsigned long r_stamp; /* send OR check time */
77 bool r_resend; /* msg send failed, needs retry */
78
79 struct ceph_file_layout r_file_layout;
80 struct ceph_snap_context *r_snapc; /* snap context for writes */
81 unsigned r_num_pages; /* size of page array (follows) */
82 struct page **r_pages; /* pages for data payload */
83 int r_pages_from_pool;
84 int r_own_pages; /* if true, i own page list */
85#ifdef CONFIG_BLOCK
86 struct bio *r_bio; /* instead of pages */
87#endif
88
89 struct ceph_pagelist *r_trail; /* trailing part of the data */
90};
91
92struct ceph_osd_client {
93 struct ceph_client *client;
94
95 struct ceph_osdmap *osdmap; /* current map */
96 struct rw_semaphore map_sem;
97 struct completion map_waiters;
98 u64 last_requested_map;
99
100 struct mutex request_mutex;
101 struct rb_root osds; /* osds */
102 struct list_head osd_lru; /* idle osds */
103 u64 timeout_tid; /* tid of timeout triggering rq */
104 u64 last_tid; /* tid of last request */
105 struct rb_root requests; /* pending requests */
106 struct list_head req_lru; /* pending requests lru */
107 int num_requests;
108 struct delayed_work timeout_work;
109 struct delayed_work osds_timeout_work;
110#ifdef CONFIG_DEBUG_FS
111 struct dentry *debugfs_file;
112#endif
113
114 mempool_t *req_mempool;
115
116 struct ceph_msgpool msgpool_op;
117 struct ceph_msgpool msgpool_op_reply;
118};
119
120struct ceph_osd_req_op {
121 u16 op; /* CEPH_OSD_OP_* */
122 u32 flags; /* CEPH_OSD_FLAG_* */
123 union {
124 struct {
125 u64 offset, length;
126 u64 truncate_size;
127 u32 truncate_seq;
128 } extent;
129 struct {
130 const char *name;
131 u32 name_len;
132 const char *val;
133 u32 value_len;
134 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
135 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
136 } xattr;
137 struct {
138 const char *class_name;
139 __u8 class_len;
140 const char *method_name;
141 __u8 method_len;
142 __u8 argc;
143 const char *indata;
144 u32 indata_len;
145 } cls;
146 struct {
147 u64 cookie, count;
148 } pgls;
149 struct {
150 u64 snapid;
151 } snap;
152 };
153 u32 payload_len;
154};
155
156extern int ceph_osdc_init(struct ceph_osd_client *osdc,
157 struct ceph_client *client);
158extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
159
160extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
161 struct ceph_msg *msg);
162extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
163 struct ceph_msg *msg);
164
165extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
166 struct ceph_file_layout *layout,
167 u64 snapid,
168 u64 off, u64 *plen, u64 *bno,
169 struct ceph_osd_request *req,
170 struct ceph_osd_req_op *op);
171
172extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
173 int flags,
174 struct ceph_snap_context *snapc,
175 struct ceph_osd_req_op *ops,
176 bool use_mempool,
177 gfp_t gfp_flags,
178 struct page **pages,
179 struct bio *bio);
180
181extern void ceph_osdc_build_request(struct ceph_osd_request *req,
182 u64 off, u64 *plen,
183 struct ceph_osd_req_op *src_ops,
184 struct ceph_snap_context *snapc,
185 struct timespec *mtime,
186 const char *oid,
187 int oid_len);
188
189extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
190 struct ceph_file_layout *layout,
191 struct ceph_vino vino,
192 u64 offset, u64 *len, int op, int flags,
193 struct ceph_snap_context *snapc,
194 int do_sync, u32 truncate_seq,
195 u64 truncate_size,
196 struct timespec *mtime,
197 bool use_mempool, int num_reply);
198
199static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
200{
201 kref_get(&req->r_kref);
202}
203extern void ceph_osdc_release_request(struct kref *kref);
204static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
205{
206 kref_put(&req->r_kref, ceph_osdc_release_request);
207}
208
209extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
210 struct ceph_osd_request *req,
211 bool nofail);
212extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
213 struct ceph_osd_request *req);
214extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
215
216extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
217 struct ceph_vino vino,
218 struct ceph_file_layout *layout,
219 u64 off, u64 *plen,
220 u32 truncate_seq, u64 truncate_size,
221 struct page **pages, int nr_pages);
222
223extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
224 struct ceph_vino vino,
225 struct ceph_file_layout *layout,
226 struct ceph_snap_context *sc,
227 u64 off, u64 len,
228 u32 truncate_seq, u64 truncate_size,
229 struct timespec *mtime,
230 struct page **pages, int nr_pages,
231 int flags, int do_sync, bool nofail);
232
233#endif
234
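A minimal sketch of a synchronous object read using the helpers declared above; the vino/layout setup and the page allocation are assumed to be done by the caller, and the truncate parameters are simply zeroed:

/* Illustrative only: read up to len bytes of object data into pages. */
static int example_read(struct ceph_osd_client *osdc, struct ceph_vino vino,
			struct ceph_file_layout *layout,
			u64 off, u64 len, struct page **pages, int nr_pages)
{
	u64 plen = len;

	return ceph_osdc_readpages(osdc, vino, layout, off, &plen,
				   0 /* truncate_seq */, 0 /* truncate_size */,
				   pages, nr_pages);
}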
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
new file mode 100644
index 000000000000..ba4c205cbb01
--- /dev/null
+++ b/include/linux/ceph/osdmap.h
@@ -0,0 +1,130 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include <linux/crush/crush.h>
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
129
130#endif
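The layout helpers above make the striping arithmetic explicit: with a 64 KB stripe unit and a stripe count of 4, ceph_file_layout_stripe_width() yields 256 KB, and with a 4 MB object size ceph_file_layout_period() yields 16 MB. A small sketch combining the osd-state helpers (the function name is illustrative):

/* Illustrative only: return a peer address only for an osd that is up. */
static struct ceph_entity_addr *example_up_addr(struct ceph_osdmap *map, int osd)
{
	if (!ceph_osd_is_up(map, osd))
		return NULL;
	return ceph_osd_addr(map, osd);   /* NULL if osd >= max_osd */
}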
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h
new file mode 100644
index 000000000000..9660d6b0a35d
--- /dev/null
+++ b/include/linux/ceph/pagelist.h
@@ -0,0 +1,75 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11 struct list_head free_list;
12 size_t num_pages_free;
13};
14
15struct ceph_pagelist_cursor {
16 struct ceph_pagelist *pl; /* pagelist, for error checking */
17 struct list_head *page_lru; /* page in list */
18 size_t room; /* room remaining to reset to */
19};
20
21static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
22{
23 INIT_LIST_HEAD(&pl->head);
24 pl->mapped_tail = NULL;
25 pl->length = 0;
26 pl->room = 0;
27 INIT_LIST_HEAD(&pl->free_list);
28 pl->num_pages_free = 0;
29}
30
31extern int ceph_pagelist_release(struct ceph_pagelist *pl);
32
33extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
34
35extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
36
37extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
38
39extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
40 struct ceph_pagelist_cursor *c);
41
42extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
43 struct ceph_pagelist_cursor *c);
44
45static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
46{
47 __le64 ev = cpu_to_le64(v);
48 return ceph_pagelist_append(pl, &ev, sizeof(ev));
49}
50static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
51{
52 __le32 ev = cpu_to_le32(v);
53 return ceph_pagelist_append(pl, &ev, sizeof(ev));
54}
55static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
56{
57 __le16 ev = cpu_to_le16(v);
58 return ceph_pagelist_append(pl, &ev, sizeof(ev));
59}
60static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
61{
62 return ceph_pagelist_append(pl, &v, 1);
63}
64static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
65 char *s, size_t len)
66{
67 int ret = ceph_pagelist_encode_32(pl, len);
68 if (ret)
69 return ret;
70 if (len)
71 return ceph_pagelist_append(pl, s, len);
72 return 0;
73}
74
75#endif
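The cursor declared above exists so a partially appended record can be rolled back. A minimal sketch of that pattern, appending a length-prefixed string followed by a 64-bit value; the record format is illustrative, and <linux/string.h> is assumed for strlen():

/* Illustrative only: append name + value, undoing the append on failure. */
static int example_encode(struct ceph_pagelist *pl, char *name, u64 val)
{
	struct ceph_pagelist_cursor c;
	int ret;

	ceph_pagelist_set_cursor(pl, &c);
	ret = ceph_pagelist_encode_string(pl, name, strlen(name));
	if (!ret)
		ret = ceph_pagelist_encode_64(pl, val);
	if (ret)
		ceph_pagelist_truncate(pl, &c);   /* drop the partial record */
	return ret;
}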
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
new file mode 100644
index 000000000000..6d5247f2e81b
--- /dev/null
+++ b/include/linux/ceph/rados.h
@@ -0,0 +1,405 @@
1#ifndef CEPH_RADOS_H
2#define CEPH_RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
 80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207
208 /** attrs **/
209 /* read */
210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
213
214 /* write */
215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219
220 /** subop **/
221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
226
227 /** lock **/
228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
234
235 /** exec **/
236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
237
238 /** pg **/
239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
240};
241
242static inline int ceph_osd_op_type_lock(int op)
243{
244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
245}
246static inline int ceph_osd_op_type_data(int op)
247{
248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
249}
250static inline int ceph_osd_op_type_attr(int op)
251{
252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
253}
254static inline int ceph_osd_op_type_exec(int op)
255{
256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
257}
258static inline int ceph_osd_op_type_pg(int op)
259{
260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
261}
262
263static inline int ceph_osd_op_mode_subop(int op)
264{
265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
266}
267static inline int ceph_osd_op_mode_read(int op)
268{
269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
270}
271static inline int ceph_osd_op_mode_modify(int op)
272{
273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
274}
275
276/*
277 * note that the following tmap stuff is also defined in ceph's librados.h;
278 * any modification here needs to be mirrored there
279 */
280#define CEPH_OSD_TMAP_HDR 'h'
281#define CEPH_OSD_TMAP_SET 's'
282#define CEPH_OSD_TMAP_RM 'r'
283
284extern const char *ceph_osd_op_name(int op);
285
286
287/*
288 * osd op flags
289 *
290 * An op may be READ, WRITE, or READ|WRITE.
291 */
292enum {
293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
297 CEPH_OSD_FLAG_READ = 16, /* op may read */
298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
301 CEPH_OSD_FLAG_BALANCE_READS = 256,
302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
306};
307
308enum {
309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
310};
311
312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
331/*
332 * an individual object operation. each may be accompanied by some data
333 * payload
334 */
335struct ceph_osd_op {
336 __le16 op; /* CEPH_OSD_OP_* */
337 __le32 flags; /* CEPH_OSD_FLAG_* */
338 union {
339 struct {
340 __le64 offset, length;
341 __le64 truncate_size;
342 __le32 truncate_seq;
343 } __attribute__ ((packed)) extent;
344 struct {
345 __le32 name_len;
346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
349 } __attribute__ ((packed)) xattr;
350 struct {
351 __u8 class_len;
352 __u8 method_len;
353 __u8 argc;
354 __le32 indata_len;
355 } __attribute__ ((packed)) cls;
356 struct {
357 __le64 cookie, count;
358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
362 };
363 __le32 payload_len;
364} __attribute__ ((packed));
365
366/*
367 * osd request message header. each request may include multiple
368 * ceph_osd_op object operations.
369 */
370struct ceph_osd_request_head {
371 __le32 client_inc; /* client incarnation */
372 struct ceph_object_layout layout; /* pgid */
373 __le32 osdmap_epoch; /* client's osdmap epoch */
374
375 __le32 flags;
376
377 struct ceph_timespec mtime; /* for mutations only */
378 struct ceph_eversion reassert_version; /* if we are replaying op */
379
380 __le32 object_len; /* length of object name */
381
382 __le64 snapid; /* snapid to read */
383 __le64 snap_seq; /* writer's snap context */
384 __le32 num_snaps;
385
386 __le16 num_ops;
387 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
388} __attribute__ ((packed));
389
390struct ceph_osd_reply_head {
391 __le32 client_inc; /* client incarnation */
392 __le32 flags;
393 struct ceph_object_layout layout;
394 __le32 osdmap_epoch;
395 struct ceph_eversion reassert_version; /* for replaying uncommitted */
396
397 __le32 result; /* result code */
398
399 __le32 object_len; /* length of object name */
400 __le32 num_ops;
401 struct ceph_osd_op ops[0]; /* ops[], object */
402} __attribute__ ((packed));
403
404
405#endif
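A worked example of the ceph_stable_mod() helper defined above, with b = 12 placement groups and bmask = 15: values already below 12 keep their bin, and only the top four residues fold into the lower half, so later growth of b moves a minimum of data.

/* ceph_stable_mod(25, 12, 15): 25 & 15 == 9,  and 9 < 12          ->  9
 * ceph_stable_mod(30, 12, 15): 30 & 15 == 14, and 14 >= 12, so    ->  30 & 7 == 6 */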
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/include/linux/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c991023ee47..709dfb901d11 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
75 75
76 unsigned long flags; 76 unsigned long flags;
77 /* ID for this css, if possible */ 77 /* ID for this css, if possible */
78 struct css_id *id; 78 struct css_id __rcu *id;
79}; 79};
80 80
81/* bits in struct cgroup_subsys_state flags field */ 81/* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
205 struct list_head children; /* my children */ 205 struct list_head children; /* my children */
206 206
207 struct cgroup *parent; /* my parent */ 207 struct cgroup *parent; /* my parent */
208 struct dentry *dentry; /* cgroup fs entry, RCU protected */ 208 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
209 209
210 /* Private pointers for each registered subsystem */ 210 /* Private pointers for each registered subsystem */
211 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 211 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c1a62c56a660..320d6c94ff84 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
16# define __release(x) __context__(x,-1) 16# define __release(x) __context__(x,-1)
17# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) 17# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0)
18# define __percpu __attribute__((noderef, address_space(3))) 18# define __percpu __attribute__((noderef, address_space(3)))
19#ifdef CONFIG_SPARSE_RCU_POINTER
20# define __rcu __attribute__((noderef, address_space(4)))
21#else
19# define __rcu 22# define __rcu
23#endif
20extern void __chk_user_ptr(const volatile void __user *); 24extern void __chk_user_ptr(const volatile void __user *);
21extern void __chk_io_ptr(const volatile void __iomem *); 25extern void __chk_io_ptr(const volatile void __iomem *);
22#else 26#else
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4d2c39573f36..4aaeab376446 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
84 atomic_t usage; 84 atomic_t usage;
85 pid_t tgid; /* thread group process ID */ 85 pid_t tgid; /* thread group process ID */
86 spinlock_t lock; 86 spinlock_t lock;
87 struct key *session_keyring; /* keyring inherited over fork */ 87 struct key __rcu *session_keyring; /* keyring inherited over fork */
88 struct key *process_keyring; /* keyring private to this process */ 88 struct key *process_keyring; /* keyring private to this process */
89 struct rcu_head rcu; /* RCU deletion hook */ 89 struct rcu_head rcu; /* RCU deletion hook */
90}; 90};
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
new file mode 100644
index 000000000000..97e435b191f4
--- /dev/null
+++ b/include/linux/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
 29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h
new file mode 100644
index 000000000000..91e884230d5d
--- /dev/null
+++ b/include/linux/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef CEPH_CRUSH_HASH_H
2#define CEPH_CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
new file mode 100644
index 000000000000..c46b99c18bb0
--- /dev/null
+++ b/include/linux/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef CEPH_CRUSH_MAPPER_H
2#define CEPH_CRUSH_MAPPER_H
3
4/*
 5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
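The two functions above implement the two-step mapping: find the rule that matches a pool's ruleset, type, and size, then run it on an input value (typically a placement-group hash) to produce the output osd set. A minimal sketch of that sequence; the result-array sizing and error handling are assumptions:

/* Illustrative only: map input x to at most `size` osds via the pool's rule. */
static int example_map(struct crush_map *map, int pool, int type, int size,
		       int x, __u32 *weights)
{
	int result[CRUSH_MAX_SET];
	int ruleno = crush_find_rule(map, pool, type, size);

	if (ruleno < 0)
		return ruleno;                 /* no matching rule */
	return crush_do_rule(map, ruleno, x, result, size,
			     -1 /* no forcefeed */, weights);
}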
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 29b3ce3f2a1d..2833452ea01c 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -49,7 +49,6 @@ struct task_struct;
49 49
50#ifdef CONFIG_LOCKDEP 50#ifdef CONFIG_LOCKDEP
51extern void debug_show_all_locks(void); 51extern void debug_show_all_locks(void);
52extern void __debug_show_held_locks(struct task_struct *task);
53extern void debug_show_held_locks(struct task_struct *task); 52extern void debug_show_held_locks(struct task_struct *task);
54extern void debug_check_no_locks_freed(const void *from, unsigned long len); 53extern void debug_check_no_locks_freed(const void *from, unsigned long len);
55extern void debug_check_no_locks_held(struct task_struct *task); 54extern void debug_check_no_locks_held(struct task_struct *task);
@@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void)
58{ 57{
59} 58}
60 59
61static inline void __debug_show_held_locks(struct task_struct *task)
62{
63}
64
65static inline void debug_show_held_locks(struct task_struct *task) 60static inline void debug_show_held_locks(struct task_struct *task)
66{ 61{
67} 62}
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 52c0da4bdd18..bef3cda44c4c 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -1,6 +1,8 @@
1#ifndef _DYNAMIC_DEBUG_H 1#ifndef _DYNAMIC_DEBUG_H
2#define _DYNAMIC_DEBUG_H 2#define _DYNAMIC_DEBUG_H
3 3
4#include <linux/jump_label.h>
5
4/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which 6/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
5 * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They 7 * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
6 * use independent hash functions, to reduce the chance of false positives. 8 * use independent hash functions, to reduce the chance of false positives.
@@ -22,8 +24,6 @@ struct _ddebug {
22 const char *function; 24 const char *function;
23 const char *filename; 25 const char *filename;
24 const char *format; 26 const char *format;
25 char primary_hash;
26 char secondary_hash;
27 unsigned int lineno:24; 27 unsigned int lineno:24;
28 /* 28 /*
29 * The flags field controls the behaviour at the callsite. 29 * The flags field controls the behaviour at the callsite.
@@ -33,6 +33,7 @@ struct _ddebug {
33#define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */ 33#define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */
34#define _DPRINTK_FLAGS_DEFAULT 0 34#define _DPRINTK_FLAGS_DEFAULT 0
35 unsigned int flags:8; 35 unsigned int flags:8;
36 char enabled;
36} __attribute__((aligned(8))); 37} __attribute__((aligned(8)));
37 38
38 39
@@ -42,33 +43,35 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
42#if defined(CONFIG_DYNAMIC_DEBUG) 43#if defined(CONFIG_DYNAMIC_DEBUG)
43extern int ddebug_remove_module(const char *mod_name); 44extern int ddebug_remove_module(const char *mod_name);
44 45
45#define __dynamic_dbg_enabled(dd) ({ \
46 int __ret = 0; \
47 if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) && \
48 (dynamic_debug_enabled2 & (1LL << DEBUG_HASH2)))) \
49 if (unlikely(dd.flags)) \
50 __ret = 1; \
51 __ret; })
52
53#define dynamic_pr_debug(fmt, ...) do { \ 46#define dynamic_pr_debug(fmt, ...) do { \
47 __label__ do_printk; \
48 __label__ out; \
54 static struct _ddebug descriptor \ 49 static struct _ddebug descriptor \
55 __used \ 50 __used \
56 __attribute__((section("__verbose"), aligned(8))) = \ 51 __attribute__((section("__verbose"), aligned(8))) = \
57 { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ 52 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
58 DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ 53 _DPRINTK_FLAGS_DEFAULT }; \
59 if (__dynamic_dbg_enabled(descriptor)) \ 54 JUMP_LABEL(&descriptor.enabled, do_printk); \
60 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ 55 goto out; \
56do_printk: \
57 printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
58out: ; \
61 } while (0) 59 } while (0)
62 60
63 61
64#define dynamic_dev_dbg(dev, fmt, ...) do { \ 62#define dynamic_dev_dbg(dev, fmt, ...) do { \
63 __label__ do_printk; \
64 __label__ out; \
65 static struct _ddebug descriptor \ 65 static struct _ddebug descriptor \
66 __used \ 66 __used \
67 __attribute__((section("__verbose"), aligned(8))) = \ 67 __attribute__((section("__verbose"), aligned(8))) = \
68 { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ 68 { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
69 DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ 69 _DPRINTK_FLAGS_DEFAULT }; \
70 if (__dynamic_dbg_enabled(descriptor)) \ 70 JUMP_LABEL(&descriptor.enabled, do_printk); \
71 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ 71 goto out; \
72do_printk: \
73 dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
74out: ; \
72 } while (0) 75 } while (0)
73 76
74#else 77#else
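
With the hash-based gating gone, each callsite is now keyed off its own descriptor.enabled byte through JUMP_LABEL(). A sketch of what a caller looks like, assuming CONFIG_DYNAMIC_DEBUG is set so that pr_debug() expands to dynamic_pr_debug(); the driver function is invented:

#include <linux/kernel.h>

static void example_probe_step(int unit)
{
        /* emits a struct _ddebug into the __verbose section; flipping the
         * site's 'enabled' flag via the dynamic_debug control file makes
         * the JUMP_LABEL() test take the do_printk branch */
        pr_debug("probing unit %d\n", unit);
}
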
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 7cf92e8a4196..36c66443bdfd 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -13,6 +13,7 @@
13#define _LINUX_EDAC_H_ 13#define _LINUX_EDAC_H_
14 14
15#include <asm/atomic.h> 15#include <asm/atomic.h>
16#include <linux/sysdev.h>
16 17
17#define EDAC_OPSTATE_INVAL -1 18#define EDAC_OPSTATE_INVAL -1
18#define EDAC_OPSTATE_POLL 0 19#define EDAC_OPSTATE_POLL 0
@@ -22,9 +23,12 @@
22extern int edac_op_state; 23extern int edac_op_state;
23extern int edac_err_assert; 24extern int edac_err_assert;
24extern atomic_t edac_handlers; 25extern atomic_t edac_handlers;
26extern struct sysdev_class edac_class;
25 27
26extern int edac_handler_set(void); 28extern int edac_handler_set(void);
27extern void edac_atomic_assert_error(void); 29extern void edac_atomic_assert_error(void);
30extern struct sysdev_class *edac_get_sysfs_class(void);
31extern void edac_put_sysfs_class(void);
28 32
29static inline void opstate_init(void) 33static inline void opstate_init(void)
30{ 34{
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f59ed297b661..133c0ba25e30 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
31 31
32struct fdtable { 32struct fdtable {
33 unsigned int max_fds; 33 unsigned int max_fds;
34 struct file ** fd; /* current fd array */ 34 struct file __rcu **fd; /* current fd array */
35 fd_set *close_on_exec; 35 fd_set *close_on_exec;
36 fd_set *open_fds; 36 fd_set *open_fds;
37 struct rcu_head rcu; 37 struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
46 * read mostly part 46 * read mostly part
47 */ 47 */
48 atomic_t count; 48 atomic_t count;
49 struct fdtable *fdt; 49 struct fdtable __rcu *fdt;
50 struct fdtable fdtab; 50 struct fdtable fdtab;
51 /* 51 /*
52 * written part on a separate cache line in SMP 52 * written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
55 int next_fd; 55 int next_fd;
56 struct embedded_fd_set close_on_exec_init; 56 struct embedded_fd_set close_on_exec_init;
57 struct embedded_fd_set open_fds_init; 57 struct embedded_fd_set open_fds_init;
58 struct file * fd_array[NR_OPEN_DEFAULT]; 58 struct file __rcu * fd_array[NR_OPEN_DEFAULT];
59}; 59};
60 60
61#define rcu_dereference_check_fdtable(files, fdtfd) \ 61#define rcu_dereference_check_fdtable(files, fdtfd) \
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b7..3168dcfb94f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
1384 * Saved mount options for lazy filesystems using 1384 * Saved mount options for lazy filesystems using
1385 * generic_show_options() 1385 * generic_show_options()
1386 */ 1386 */
1387 char *s_options; 1387 char __rcu *s_options;
1388}; 1388};
1389 1389
1390extern struct timespec current_fs_time(struct super_block *sb); 1390extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 02b8b24f8f51..8beabb958f61 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -191,8 +191,8 @@ struct ftrace_event_call {
191 unsigned int flags; 191 unsigned int flags;
192 192
193#ifdef CONFIG_PERF_EVENTS 193#ifdef CONFIG_PERF_EVENTS
194 int perf_refcount; 194 int perf_refcount;
195 struct hlist_head *perf_events; 195 struct hlist_head __percpu *perf_events;
196#endif 196#endif
197}; 197};
198 198
@@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
252 252
253extern int perf_trace_init(struct perf_event *event); 253extern int perf_trace_init(struct perf_event *event);
254extern void perf_trace_destroy(struct perf_event *event); 254extern void perf_trace_destroy(struct perf_event *event);
255extern int perf_trace_enable(struct perf_event *event); 255extern int perf_trace_add(struct perf_event *event, int flags);
256extern void perf_trace_disable(struct perf_event *event); 256extern void perf_trace_del(struct perf_event *event, int flags);
257extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, 257extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
258 char *filter_str); 258 char *filter_str);
259extern void ftrace_profile_free_filter(struct perf_event *event); 259extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4d8fb0..af3f06b41dc1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
129struct disk_part_tbl { 129struct disk_part_tbl {
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 int len; 131 int len;
132 struct hd_struct *last_lookup; 132 struct hd_struct __rcu *last_lookup;
133 struct hd_struct *part[]; 133 struct hd_struct __rcu *part[];
134}; 134};
135 135
136struct gendisk { 136struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
149 * non-critical accesses use RCU. Always access through 149 * non-critical accesses use RCU. Always access through
150 * helpers. 150 * helpers.
151 */ 151 */
152 struct disk_part_tbl *part_tbl; 152 struct disk_part_tbl __rcu *part_tbl;
153 struct hd_struct part0; 153 struct hd_struct part0;
154 154
155 const struct block_device_operations *fops; 155 const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d5b387669dab..96c323ac44df 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,8 @@
64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
65#define NMI_OFFSET (1UL << NMI_SHIFT) 65#define NMI_OFFSET (1UL << NMI_SHIFT)
66 66
67#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
68
67#ifndef PREEMPT_ACTIVE 69#ifndef PREEMPT_ACTIVE
68#define PREEMPT_ACTIVE_BITS 1 70#define PREEMPT_ACTIVE_BITS 1
69#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) 71#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
82/* 84/*
83 * Are we doing bottom half or hardware interrupt processing? 85 * Are we doing bottom half or hardware interrupt processing?
84 * Are we in a softirq context? Interrupt context? 86 * Are we in a softirq context? Interrupt context?
 87 * in_softirq - Are we currently processing a softirq or running with bh disabled?
 88 * in_serving_softirq - Are we currently processing a softirq?
85 */ 89 */
86#define in_irq() (hardirq_count()) 90#define in_irq() (hardirq_count())
87#define in_softirq() (softirq_count()) 91#define in_softirq() (softirq_count())
88#define in_interrupt() (irq_count()) 92#define in_interrupt() (irq_count())
93#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
89 94
90/* 95/*
91 * Are we in NMI context? 96 * Are we in NMI context?
@@ -132,14 +137,16 @@ extern void synchronize_irq(unsigned int irq);
132 137
133struct task_struct; 138struct task_struct;
134 139
135#ifndef CONFIG_VIRT_CPU_ACCOUNTING 140#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
136static inline void account_system_vtime(struct task_struct *tsk) 141static inline void account_system_vtime(struct task_struct *tsk)
137{ 142{
138} 143}
144#else
145extern void account_system_vtime(struct task_struct *tsk);
139#endif 146#endif
140 147
141#if defined(CONFIG_NO_HZ) 148#if defined(CONFIG_NO_HZ)
142#if defined(CONFIG_TINY_RCU) 149#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
143extern void rcu_enter_nohz(void); 150extern void rcu_enter_nohz(void);
144extern void rcu_exit_nohz(void); 151extern void rcu_exit_nohz(void);
145 152
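
To illustrate the distinction the new helper draws (the caller is invented): in_softirq() is also true inside a plain local_bh_disable() section, while in_serving_softirq() is only true while a softirq handler is actually executing:

#include <linux/types.h>
#include <linux/hardirq.h>

static bool example_must_defer(void)
{
        /* true in hardirq context or while servicing a softirq, but not
         * merely because the caller has bottom halves disabled */
        return in_irq() || in_serving_softirq();
}
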
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e968db71e33a..cdb715e58e3e 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
50 50
51struct idr_layer { 51struct idr_layer {
52 unsigned long bitmap; /* A zero bit means "space here" */ 52 unsigned long bitmap; /* A zero bit means "space here" */
53 struct idr_layer *ary[1<<IDR_BITS]; 53 struct idr_layer __rcu *ary[1<<IDR_BITS];
54 int count; /* When zero, we can release it */ 54 int count; /* When zero, we can release it */
55 int layer; /* distance from leaf */ 55 int layer; /* distance from leaf */
56 struct rcu_head rcu_head; 56 struct rcu_head rcu_head;
57}; 57};
58 58
59struct idr { 59struct idr {
60 struct idr_layer *top; 60 struct idr_layer __rcu *top;
61 struct idr_layer *id_free; 61 struct idr_layer *id_free;
62 int layers; /* only valid without concurrent changes */ 62 int layers; /* only valid without concurrent changes */
63 int id_free_cnt; 63 int id_free_cnt;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f43fa56f600..2fea6c8ef6ba 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -82,11 +82,17 @@ extern struct group_info init_groups;
82# define CAP_INIT_BSET CAP_FULL_SET 82# define CAP_INIT_BSET CAP_FULL_SET
83 83
84#ifdef CONFIG_TREE_PREEMPT_RCU 84#ifdef CONFIG_TREE_PREEMPT_RCU
85#define INIT_TASK_RCU_TREE_PREEMPT() \
86 .rcu_blocked_node = NULL,
87#else
88#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
89#endif
90#ifdef CONFIG_PREEMPT_RCU
85#define INIT_TASK_RCU_PREEMPT(tsk) \ 91#define INIT_TASK_RCU_PREEMPT(tsk) \
86 .rcu_read_lock_nesting = 0, \ 92 .rcu_read_lock_nesting = 0, \
87 .rcu_read_unlock_special = 0, \ 93 .rcu_read_unlock_special = 0, \
88 .rcu_blocked_node = NULL, \ 94 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
89 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), 95 INIT_TASK_RCU_TREE_PREEMPT()
90#else 96#else
91#define INIT_TASK_RCU_PREEMPT(tsk) 97#define INIT_TASK_RCU_PREEMPT(tsk)
92#endif 98#endif
@@ -137,8 +143,8 @@ extern struct cred init_cred;
137 .children = LIST_HEAD_INIT(tsk.children), \ 143 .children = LIST_HEAD_INIT(tsk.children), \
138 .sibling = LIST_HEAD_INIT(tsk.sibling), \ 144 .sibling = LIST_HEAD_INIT(tsk.sibling), \
139 .group_leader = &tsk, \ 145 .group_leader = &tsk, \
140 .real_cred = &init_cred, \ 146 RCU_INIT_POINTER(.real_cred, &init_cred), \
141 .cred = &init_cred, \ 147 RCU_INIT_POINTER(.cred, &init_cred), \
142 .cred_guard_mutex = \ 148 .cred_guard_mutex = \
143 __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ 149 __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \
144 .comm = "swapper", \ 150 .comm = "swapper", \
diff --git a/include/linux/input.h b/include/linux/input.h
index 896a92227bc4..d6ae1761be97 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1196,7 +1196,7 @@ struct input_dev {
1196 int (*flush)(struct input_dev *dev, struct file *file); 1196 int (*flush)(struct input_dev *dev, struct file *file);
1197 int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); 1197 int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);
1198 1198
1199 struct input_handle *grab; 1199 struct input_handle __rcu *grab;
1200 1200
1201 spinlock_t event_lock; 1201 spinlock_t event_lock;
1202 struct mutex mutex; 1202 struct mutex mutex;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 19988983aeac..414328577ced 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
18#include <asm/atomic.h> 18#include <asm/atomic.h>
19#include <asm/ptrace.h> 19#include <asm/ptrace.h>
20#include <asm/system.h> 20#include <asm/system.h>
21#include <trace/events/irq.h>
21 22
22/* 23/*
23 * These correspond to the IORESOURCE_IRQ_* defines in 24 * These correspond to the IORESOURCE_IRQ_* defines in
@@ -407,7 +408,12 @@ asmlinkage void do_softirq(void);
407asmlinkage void __do_softirq(void); 408asmlinkage void __do_softirq(void);
408extern void open_softirq(int nr, void (*action)(struct softirq_action *)); 409extern void open_softirq(int nr, void (*action)(struct softirq_action *));
409extern void softirq_init(void); 410extern void softirq_init(void);
410#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) 411static inline void __raise_softirq_irqoff(unsigned int nr)
412{
413 trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
414 or_softirq_pending(1UL << nr);
415}
416
411extern void raise_softirq_irqoff(unsigned int nr); 417extern void raise_softirq_irqoff(unsigned int nr);
412extern void raise_softirq(unsigned int nr); 418extern void raise_softirq(unsigned int nr);
413extern void wakeup_softirqd(void); 419extern void wakeup_softirqd(void);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 64d529133031..3e70b21884a9 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -53,7 +53,7 @@ struct io_context {
53 53
54 struct radix_tree_root radix_root; 54 struct radix_tree_root radix_root;
55 struct hlist_head cic_list; 55 struct hlist_head cic_list;
56 void *ioc_data; 56 void __rcu *ioc_data;
57}; 57};
58 58
59static inline struct io_context *ioc_task_link(struct io_context *ioc) 59static inline struct io_context *ioc_task_link(struct io_context *ioc)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
new file mode 100644
index 000000000000..4fa09d4d0b71
--- /dev/null
+++ b/include/linux/irq_work.h
@@ -0,0 +1,20 @@
1#ifndef _LINUX_IRQ_WORK_H
2#define _LINUX_IRQ_WORK_H
3
4struct irq_work {
5 struct irq_work *next;
6 void (*func)(struct irq_work *);
7};
8
9static inline
10void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
11{
12 entry->next = NULL;
13 entry->func = func;
14}
15
16bool irq_work_queue(struct irq_work *entry);
17void irq_work_run(void);
18void irq_work_sync(struct irq_work *entry);
19
20#endif /* _LINUX_IRQ_WORK_H */
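
A minimal usage sketch for the new interface; the callback and the trigger site are invented. irq_work_queue() is intended to be callable from contexts where almost nothing else is safe (e.g. NMI), with the callback running later from a self-IPI:

#include <linux/kernel.h>
#include <linux/irq_work.h>

static void example_wakeup_cb(struct irq_work *work)
{
        pr_info("deferred irq_work callback running\n");
}

static struct irq_work example_work;

static void example_setup(void)
{
        init_irq_work(&example_work, example_wakeup_cb);
}

static void example_from_nmi(void)
{
        /* safe where sleeping or taking locks is forbidden */
        irq_work_queue(&example_work);
}
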
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
new file mode 100644
index 000000000000..b67cb180e6e9
--- /dev/null
+++ b/include/linux/jump_label.h
@@ -0,0 +1,74 @@
1#ifndef _LINUX_JUMP_LABEL_H
2#define _LINUX_JUMP_LABEL_H
3
4#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_HAVE_ARCH_JUMP_LABEL)
5# include <asm/jump_label.h>
6# define HAVE_JUMP_LABEL
7#endif
8
9enum jump_label_type {
10 JUMP_LABEL_ENABLE,
11 JUMP_LABEL_DISABLE
12};
13
14struct module;
15
16#ifdef HAVE_JUMP_LABEL
17
18extern struct jump_entry __start___jump_table[];
19extern struct jump_entry __stop___jump_table[];
20
21extern void arch_jump_label_transform(struct jump_entry *entry,
22 enum jump_label_type type);
23extern void arch_jump_label_text_poke_early(jump_label_t addr);
24extern void jump_label_update(unsigned long key, enum jump_label_type type);
25extern void jump_label_apply_nops(struct module *mod);
26extern int jump_label_text_reserved(void *start, void *end);
27
28#define jump_label_enable(key) \
29 jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE);
30
31#define jump_label_disable(key) \
32 jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE);
33
34#else
35
36#define JUMP_LABEL(key, label) \
37do { \
38 if (unlikely(*key)) \
39 goto label; \
40} while (0)
41
42#define jump_label_enable(cond_var) \
43do { \
44 *(cond_var) = 1; \
45} while (0)
46
47#define jump_label_disable(cond_var) \
48do { \
49 *(cond_var) = 0; \
50} while (0)
51
52static inline int jump_label_apply_nops(struct module *mod)
53{
54 return 0;
55}
56
57static inline int jump_label_text_reserved(void *start, void *end)
58{
59 return 0;
60}
61
62#endif
63
64#define COND_STMT(key, stmt) \
65do { \
66 __label__ jl_enabled; \
67 JUMP_LABEL(key, jl_enabled); \
68 if (0) { \
69jl_enabled: \
70 stmt; \
71 } \
72} while (0)
73
74#endif
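
A sketch of the intended usage pattern (the key variable and the fast path are invented). With arch support the JUMP_LABEL() site compiles to a patchable nop that jump_label_enable()/jump_label_disable() rewrite at runtime; without it, the fallback above degenerates to an ordinary test of *key:

#include <linux/jump_label.h>

static int example_feature_key;                /* normally off */

static inline void example_hook(void)
{
        JUMP_LABEL(&example_feature_key, do_slow_path);
        return;                                /* common case: fall through */

do_slow_path:
        /* rarely enabled instrumentation would run here */
        ;
}

/* e.g. from a debugfs or sysctl write handler: */
static void example_enable_feature(void)
{
        jump_label_enable(&example_feature_key);
}

COND_STMT() packages the same branch-around-a-statement idiom for one-off uses, as the perf scheduling hooks later in this series do.
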
diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h
new file mode 100644
index 000000000000..e5d012ad92c6
--- /dev/null
+++ b/include/linux/jump_label_ref.h
@@ -0,0 +1,44 @@
1#ifndef _LINUX_JUMP_LABEL_REF_H
2#define _LINUX_JUMP_LABEL_REF_H
3
4#include <linux/jump_label.h>
5#include <asm/atomic.h>
6
7#ifdef HAVE_JUMP_LABEL
8
9static inline void jump_label_inc(atomic_t *key)
10{
11 if (atomic_add_return(1, key) == 1)
12 jump_label_enable(key);
13}
14
15static inline void jump_label_dec(atomic_t *key)
16{
17 if (atomic_dec_and_test(key))
18 jump_label_disable(key);
19}
20
21#else /* !HAVE_JUMP_LABEL */
22
23static inline void jump_label_inc(atomic_t *key)
24{
25 atomic_inc(key);
26}
27
28static inline void jump_label_dec(atomic_t *key)
29{
30 atomic_dec(key);
31}
32
33#undef JUMP_LABEL
34#define JUMP_LABEL(key, label) \
35do { \
36 if (unlikely(__builtin_choose_expr( \
37 __builtin_types_compatible_p(typeof(key), atomic_t *), \
38 atomic_read((atomic_t *)(key)), *(key)))) \
39 goto label; \
40} while (0)
41
42#endif /* HAVE_JUMP_LABEL */
43
44#endif /* _LINUX_JUMP_LABEL_REF_H */
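
A refcounted sketch (names invented): multiple users share one atomic_t key, and only the 0->1 and 1->0 transitions touch the underlying jump label, mirroring how perf_task_events is used further down in this series:

#include <linux/jump_label_ref.h>
#include <asm/atomic.h>

static atomic_t example_users = ATOMIC_INIT(0);

static void example_user_register(void)
{
        jump_label_inc(&example_users);   /* first user enables the key */
}

static void example_user_unregister(void)
{
        jump_label_dec(&example_users);   /* last user disables it */
}
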
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b0a35e6bc69..1759ba5adce8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,18 @@ extern const char linux_proc_banner[];
58 58
59#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) 59#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
60#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) 60#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
61#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) 61#define roundup(x, y) ( \
62{ \
63 typeof(y) __y = y; \
64 (((x) + (__y - 1)) / __y) * __y; \
65} \
66)
67#define rounddown(x, y) ( \
68{ \
69 typeof(x) __x = (x); \
70 __x - (__x % (y)); \
71} \
72)
62#define DIV_ROUND_CLOSEST(x, divisor)( \ 73#define DIV_ROUND_CLOSEST(x, divisor)( \
63{ \ 74{ \
64 typeof(divisor) __divisor = divisor; \ 75 typeof(divisor) __divisor = divisor; \
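
A quick sanity check of the reworked helpers, with values chosen purely for illustration; thanks to the statement-expression form each argument is evaluated exactly once, so arguments with side effects stay safe:

#include <linux/kernel.h>

static void example_rounding(void)
{
        unsigned int up      = roundup(10, 4);    /* 12 */
        unsigned int down    = rounddown(10, 4);  /*  8 */
        unsigned int aligned = roundup(12, 4);    /* 12: already a multiple */

        (void)up; (void)down; (void)aligned;
}
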
diff --git a/include/linux/key.h b/include/linux/key.h
index cd50dfa1d4c2..3db0adce1fda 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -178,8 +178,9 @@ struct key {
178 */ 178 */
179 union { 179 union {
180 unsigned long value; 180 unsigned long value;
181 void __rcu *rcudata;
181 void *data; 182 void *data;
182 struct keyring_list *subscriptions; 183 struct keyring_list __rcu *subscriptions;
183 } payload; 184 } payload;
184}; 185};
185 186
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c13cc48697aa..ac740b26eb10 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,7 +205,7 @@ struct kvm {
205 205
206 struct mutex irq_lock; 206 struct mutex irq_lock;
207#ifdef CONFIG_HAVE_KVM_IRQCHIP 207#ifdef CONFIG_HAVE_KVM_IRQCHIP
208 struct kvm_irq_routing_table *irq_routing; 208 struct kvm_irq_routing_table __rcu *irq_routing;
209 struct hlist_head mask_notifier_list; 209 struct hlist_head mask_notifier_list;
210 struct hlist_head irq_ack_notifier_list; 210 struct hlist_head irq_ack_notifier_list;
211#endif 211#endif
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 17d050ce7ab8..71c09b26c759 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -32,6 +32,17 @@ extern int lock_stat;
32#define MAX_LOCKDEP_SUBCLASSES 8UL 32#define MAX_LOCKDEP_SUBCLASSES 8UL
33 33
34/* 34/*
35 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
36 * cached in the instance of lockdep_map
37 *
 38 * Currently the main class (subclass == 0) and the single-depth subclass
 39 * are cached in lockdep_map. This optimization mainly targets
 40 * rq->lock: double_rq_lock() acquires it with single depth, and that
 41 * lock is highly contended.
42 */
43#define NR_LOCKDEP_CACHING_CLASSES 2
44
45/*
35 * Lock-classes are keyed via unique addresses, by embedding the 46 * Lock-classes are keyed via unique addresses, by embedding the
36 * lockclass-key into the kernel (or module) .data section. (For 47 * lockclass-key into the kernel (or module) .data section. (For
37 * static locks we use the lock address itself as the key.) 48 * static locks we use the lock address itself as the key.)
@@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class);
138 */ 149 */
139struct lockdep_map { 150struct lockdep_map {
140 struct lock_class_key *key; 151 struct lock_class_key *key;
141 struct lock_class *class_cache; 152 struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
142 const char *name; 153 const char *name;
143#ifdef CONFIG_LOCK_STAT 154#ifdef CONFIG_LOCK_STAT
144 int cpu; 155 int cpu;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ee7e258627f9..cb57d657ce4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,7 +299,7 @@ struct mm_struct {
299 * new_owner->mm == mm 299 * new_owner->mm == mm
300 * new_owner->alloc_lock is held 300 * new_owner->alloc_lock is held
301 */ 301 */
302 struct task_struct *owner; 302 struct task_struct __rcu *owner;
303#endif 303#endif
304 304
305#ifdef CONFIG_PROC_FS 305#ifdef CONFIG_PROC_FS
diff --git a/include/linux/module.h b/include/linux/module.h
index aace066bad8f..b29e7458b966 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -350,7 +350,10 @@ struct module
350 struct tracepoint *tracepoints; 350 struct tracepoint *tracepoints;
351 unsigned int num_tracepoints; 351 unsigned int num_tracepoints;
352#endif 352#endif
353 353#ifdef HAVE_JUMP_LABEL
354 struct jump_entry *jump_entries;
355 unsigned int num_jump_entries;
356#endif
354#ifdef CONFIG_TRACING 357#ifdef CONFIG_TRACING
355 const char **trace_bprintk_fmt_start; 358 const char **trace_bprintk_fmt_start;
356 unsigned int num_trace_bprintk_fmt; 359 unsigned int num_trace_bprintk_fmt;
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 9ed534c991b9..70cd0603911c 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -39,8 +39,9 @@ enum ctattr_type {
39 CTA_TUPLE_MASTER, 39 CTA_TUPLE_MASTER,
40 CTA_NAT_SEQ_ADJ_ORIG, 40 CTA_NAT_SEQ_ADJ_ORIG,
41 CTA_NAT_SEQ_ADJ_REPLY, 41 CTA_NAT_SEQ_ADJ_REPLY,
42 CTA_SECMARK, 42 CTA_SECMARK, /* obsolete */
43 CTA_ZONE, 43 CTA_ZONE,
44 CTA_SECCTX,
44 __CTA_MAX 45 __CTA_MAX
45}; 46};
46#define CTA_MAX (__CTA_MAX - 1) 47#define CTA_MAX (__CTA_MAX - 1)
@@ -172,4 +173,11 @@ enum ctattr_help {
172}; 173};
173#define CTA_HELP_MAX (__CTA_HELP_MAX - 1) 174#define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
174 175
176enum ctattr_secctx {
177 CTA_SECCTX_UNSPEC,
178 CTA_SECCTX_NAME,
179 __CTA_SECCTX_MAX
180};
181#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
182
175#endif /* _IPCONNTRACK_NETLINK_H */ 183#endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h
index 6fcd3448b186..989092bd6274 100644
--- a/include/linux/netfilter/xt_SECMARK.h
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -11,18 +11,12 @@
11 * packets are being marked for. 11 * packets are being marked for.
12 */ 12 */
13#define SECMARK_MODE_SEL 0x01 /* SELinux */ 13#define SECMARK_MODE_SEL 0x01 /* SELinux */
14#define SECMARK_SELCTX_MAX 256 14#define SECMARK_SECCTX_MAX 256
15
16struct xt_secmark_target_selinux_info {
17 __u32 selsid;
18 char selctx[SECMARK_SELCTX_MAX];
19};
20 15
21struct xt_secmark_target_info { 16struct xt_secmark_target_info {
22 __u8 mode; 17 __u8 mode;
23 union { 18 __u32 secid;
24 struct xt_secmark_target_selinux_info sel; 19 char secctx[SECMARK_SECCTX_MAX];
25 } u;
26}; 20};
27 21
28#endif /*_XT_SECMARK_H_target */ 22#endif /*_XT_SECMARK_H_target */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 508f8cf6da37..d0edf7d823ae 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
185 struct nfs4_cached_acl *nfs4_acl; 185 struct nfs4_cached_acl *nfs4_acl;
186 /* NFSv4 state */ 186 /* NFSv4 state */
187 struct list_head open_states; 187 struct list_head open_states;
188 struct nfs_delegation *delegation; 188 struct nfs_delegation __rcu *delegation;
189 fmode_t delegation_state; 189 fmode_t delegation_state;
190 struct rw_semaphore rwsem; 190 struct rw_semaphore rwsem;
191#endif /* CONFIG_NFS_V4*/ 191#endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b2f1a4d83550..2026f9e1ceb8 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
49 49
50struct notifier_block { 50struct notifier_block {
51 int (*notifier_call)(struct notifier_block *, unsigned long, void *); 51 int (*notifier_call)(struct notifier_block *, unsigned long, void *);
52 struct notifier_block *next; 52 struct notifier_block __rcu *next;
53 int priority; 53 int priority;
54}; 54};
55 55
56struct atomic_notifier_head { 56struct atomic_notifier_head {
57 spinlock_t lock; 57 spinlock_t lock;
58 struct notifier_block *head; 58 struct notifier_block __rcu *head;
59}; 59};
60 60
61struct blocking_notifier_head { 61struct blocking_notifier_head {
62 struct rw_semaphore rwsem; 62 struct rw_semaphore rwsem;
63 struct notifier_block *head; 63 struct notifier_block __rcu *head;
64}; 64};
65 65
66struct raw_notifier_head { 66struct raw_notifier_head {
67 struct notifier_block *head; 67 struct notifier_block __rcu *head;
68}; 68};
69 69
70struct srcu_notifier_head { 70struct srcu_notifier_head {
71 struct mutex mutex; 71 struct mutex mutex;
72 struct srcu_struct srcu; 72 struct srcu_struct srcu;
73 struct notifier_block *head; 73 struct notifier_block __rcu *head;
74}; 74};
75 75
76#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ 76#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 5171639ecf0f..32fb81212fd1 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -15,6 +15,7 @@
15 15
16#include <linux/types.h> 16#include <linux/types.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/init.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
20/* Each escaped entry is prefixed by ESCAPE_CODE 21/* Each escaped entry is prefixed by ESCAPE_CODE
@@ -185,4 +186,10 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val);
185int oprofile_add_data64(struct op_entry *entry, u64 val); 186int oprofile_add_data64(struct op_entry *entry, u64 val);
186int oprofile_write_commit(struct op_entry *entry); 187int oprofile_write_commit(struct op_entry *entry);
187 188
189#ifdef CONFIG_PERF_EVENTS
190int __init oprofile_perf_init(struct oprofile_operations *ops);
191void oprofile_perf_exit(void);
192char *op_name_from_perf_id(void);
193#endif /* CONFIG_PERF_EVENTS */
194
188#endif /* OPROFILE_H */ 195#endif /* OPROFILE_H */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 570fddeb0388..2615c37c8fe5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -517,6 +517,7 @@
517#define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 517#define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302
518#define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 518#define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303
519#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 519#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304
520#define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603
520#define PCI_DEVICE_ID_AMD_LANCE 0x2000 521#define PCI_DEVICE_ID_AMD_LANCE 0x2000
521#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 522#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001
522#define PCI_DEVICE_ID_AMD_SCSI 0x2020 523#define PCI_DEVICE_ID_AMD_SCSI 0x2020
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index ce2dc655cd1d..27ef6b190ea6 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -139,6 +139,15 @@
139 __aligned(PAGE_SIZE) 139 __aligned(PAGE_SIZE)
140 140
141/* 141/*
142 * Declaration/definition used for per-CPU variables that must be read mostly.
143 */
144#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
145 DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
146
147#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
148 DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
149
150/*
142 * Intermodule exports for per-CPU variables. sparse forgets about 151 * Intermodule exports for per-CPU variables. sparse forgets about
143 * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to 152 * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
144 * noop if __CHECKER__. 153 * noop if __CHECKER__.
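
A hedged sketch of the new annotation; the variable is invented. Grouping rarely-written per-CPU data in the ..readmostly section keeps it away from cache lines holding frequently-written per-CPU state:

#include <linux/percpu.h>

DEFINE_PER_CPU_READ_MOSTLY(int, example_cpu_feature);

static int example_feature_on_this_cpu(void)
{
        return this_cpu_read(example_cpu_feature);
}
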
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 49466b13c5c6..0eb50832aa00 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -39,6 +39,15 @@
39 preempt_enable(); \ 39 preempt_enable(); \
40} while (0) 40} while (0)
41 41
42#define get_cpu_ptr(var) ({ \
43 preempt_disable(); \
44 this_cpu_ptr(var); })
45
46#define put_cpu_ptr(var) do { \
47 (void)(var); \
48 preempt_enable(); \
49} while (0)
50
42#ifdef CONFIG_SMP 51#ifdef CONFIG_SMP
43 52
44/* minimum unit size, also is the maximum supported allocation size */ 53/* minimum unit size, also is the maximum supported allocation size */
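
A usage sketch for the new get_cpu_ptr()/put_cpu_ptr() pair; the per-CPU structure is invented. The returned pointer stays valid because preemption remains disabled until the matching put_cpu_ptr():

#include <linux/percpu.h>

struct example_stats {
        unsigned long hits;
};

static DEFINE_PER_CPU(struct example_stats, example_stats);

static void example_count_hit(void)
{
        struct example_stats *st = get_cpu_ptr(&example_stats);

        st->hits++;
        put_cpu_ptr(&example_stats);
}
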
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 716f99b682c1..057bf22a8323 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -486,6 +486,8 @@ struct perf_guest_info_callbacks {
486#include <linux/workqueue.h> 486#include <linux/workqueue.h>
487#include <linux/ftrace.h> 487#include <linux/ftrace.h>
488#include <linux/cpu.h> 488#include <linux/cpu.h>
489#include <linux/irq_work.h>
490#include <linux/jump_label_ref.h>
489#include <asm/atomic.h> 491#include <asm/atomic.h>
490#include <asm/local.h> 492#include <asm/local.h>
491 493
@@ -529,16 +531,22 @@ struct hw_perf_event {
529 int last_cpu; 531 int last_cpu;
530 }; 532 };
531 struct { /* software */ 533 struct { /* software */
532 s64 remaining;
533 struct hrtimer hrtimer; 534 struct hrtimer hrtimer;
534 }; 535 };
535#ifdef CONFIG_HAVE_HW_BREAKPOINT 536#ifdef CONFIG_HAVE_HW_BREAKPOINT
536 struct { /* breakpoint */ 537 struct { /* breakpoint */
537 struct arch_hw_breakpoint info; 538 struct arch_hw_breakpoint info;
538 struct list_head bp_list; 539 struct list_head bp_list;
540 /*
541 * Crufty hack to avoid the chicken and egg
542 * problem hw_breakpoint has with context
543 * creation and event initalization.
544 */
545 struct task_struct *bp_target;
539 }; 546 };
540#endif 547#endif
541 }; 548 };
549 int state;
542 local64_t prev_count; 550 local64_t prev_count;
543 u64 sample_period; 551 u64 sample_period;
544 u64 last_period; 552 u64 last_period;
@@ -550,6 +558,13 @@ struct hw_perf_event {
550#endif 558#endif
551}; 559};
552 560
561/*
562 * hw_perf_event::state flags
563 */
564#define PERF_HES_STOPPED 0x01 /* the counter is stopped */
565#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */
566#define PERF_HES_ARCH 0x04
567
553struct perf_event; 568struct perf_event;
554 569
555/* 570/*
@@ -561,36 +576,70 @@ struct perf_event;
561 * struct pmu - generic performance monitoring unit 576 * struct pmu - generic performance monitoring unit
562 */ 577 */
563struct pmu { 578struct pmu {
564 int (*enable) (struct perf_event *event); 579 struct list_head entry;
565 void (*disable) (struct perf_event *event); 580
566 int (*start) (struct perf_event *event); 581 int * __percpu pmu_disable_count;
567 void (*stop) (struct perf_event *event); 582 struct perf_cpu_context * __percpu pmu_cpu_context;
568 void (*read) (struct perf_event *event); 583 int task_ctx_nr;
569 void (*unthrottle) (struct perf_event *event); 584
585 /*
586 * Fully disable/enable this PMU, can be used to protect from the PMI
587 * as well as for lazy/batch writing of the MSRs.
588 */
589 void (*pmu_enable) (struct pmu *pmu); /* optional */
590 void (*pmu_disable) (struct pmu *pmu); /* optional */
570 591
571 /* 592 /*
572 * Group events scheduling is treated as a transaction, add group 593 * Try and initialize the event for this PMU.
573 * events as a whole and perform one schedulability test. If the test 594 * Should return -ENOENT when the @event doesn't match this PMU.
574 * fails, roll back the whole group
575 */ 595 */
596 int (*event_init) (struct perf_event *event);
597
598#define PERF_EF_START 0x01 /* start the counter when adding */
599#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
600#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
576 601
577 /* 602 /*
578 * Start the transaction, after this ->enable() doesn't need 603 * Adds/Removes a counter to/from the PMU, can be done inside
579 * to do schedulability tests. 604 * a transaction, see the ->*_txn() methods.
580 */ 605 */
581 void (*start_txn) (const struct pmu *pmu); 606 int (*add) (struct perf_event *event, int flags);
607 void (*del) (struct perf_event *event, int flags);
608
582 /* 609 /*
583 * If ->start_txn() disabled the ->enable() schedulability test 610 * Starts/Stops a counter present on the PMU. The PMI handler
611 * should stop the counter when perf_event_overflow() returns
612 * !0. ->start() will be used to continue.
613 */
614 void (*start) (struct perf_event *event, int flags);
615 void (*stop) (struct perf_event *event, int flags);
616
617 /*
618 * Updates the counter value of the event.
619 */
620 void (*read) (struct perf_event *event);
621
622 /*
623 * Group events scheduling is treated as a transaction, add
624 * group events as a whole and perform one schedulability test.
625 * If the test fails, roll back the whole group
626 *
627 * Start the transaction, after this ->add() doesn't need to
628 * do schedulability tests.
629 */
630 void (*start_txn) (struct pmu *pmu); /* optional */
631 /*
632 * If ->start_txn() disabled the ->add() schedulability test
584 * then ->commit_txn() is required to perform one. On success 633 * then ->commit_txn() is required to perform one. On success
585 * the transaction is closed. On error the transaction is kept 634 * the transaction is closed. On error the transaction is kept
586 * open until ->cancel_txn() is called. 635 * open until ->cancel_txn() is called.
587 */ 636 */
588 int (*commit_txn) (const struct pmu *pmu); 637 int (*commit_txn) (struct pmu *pmu); /* optional */
589 /* 638 /*
590 * Will cancel the transaction, assumes ->disable() is called for 639 * Will cancel the transaction, assumes ->del() is called
 591 * each successful ->enable() during the transaction. 640 * for each successful ->add() during the transaction.
592 */ 641 */
593 void (*cancel_txn) (const struct pmu *pmu); 642 void (*cancel_txn) (struct pmu *pmu); /* optional */
594}; 643};
595 644
596/** 645/**
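
To make the reshaped struct pmu concrete, here is a skeleton software-style PMU against the new callbacks; everything named example_* is invented and all real bookkeeping is elided, so this is a sketch rather than a reference implementation. add()/del() replace the old enable()/disable(), and start()/stop() honour the PERF_EF_* flags and the new hw_perf_event::state bits:

#include <linux/perf_event.h>
#include <linux/errno.h>

static void example_pmu_read(struct perf_event *event)
{
        /* fold the current count into event->count here */
}

static void example_pmu_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;                    /* counting again */
}

static void example_pmu_stop(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_UPDATE)
                example_pmu_read(event);
        event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
}

static int example_pmu_add(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
        if (flags & PERF_EF_START)
                example_pmu_start(event, PERF_EF_RELOAD);
        return 0;
}

static void example_pmu_del(struct perf_event *event, int flags)
{
        example_pmu_stop(event, PERF_EF_UPDATE);
}

static int example_pmu_event_init(struct perf_event *event)
{
        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;                 /* not handled by this PMU */
        return 0;
}

static struct pmu example_pmu = {
        .event_init     = example_pmu_event_init,
        .add            = example_pmu_add,
        .del            = example_pmu_del,
        .start          = example_pmu_start,
        .stop           = example_pmu_stop,
        .read           = example_pmu_read,
};

Registration would then go through perf_pmu_register(&example_pmu), introduced further down in this file.
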
@@ -631,11 +680,6 @@ struct perf_buffer {
631 void *data_pages[0]; 680 void *data_pages[0];
632}; 681};
633 682
634struct perf_pending_entry {
635 struct perf_pending_entry *next;
636 void (*func)(struct perf_pending_entry *);
637};
638
639struct perf_sample_data; 683struct perf_sample_data;
640 684
641typedef void (*perf_overflow_handler_t)(struct perf_event *, int, 685typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -656,6 +700,7 @@ struct swevent_hlist {
656 700
657#define PERF_ATTACH_CONTEXT 0x01 701#define PERF_ATTACH_CONTEXT 0x01
658#define PERF_ATTACH_GROUP 0x02 702#define PERF_ATTACH_GROUP 0x02
703#define PERF_ATTACH_TASK 0x04
659 704
660/** 705/**
661 * struct perf_event - performance event kernel representation: 706 * struct perf_event - performance event kernel representation:
@@ -669,7 +714,7 @@ struct perf_event {
669 int nr_siblings; 714 int nr_siblings;
670 int group_flags; 715 int group_flags;
671 struct perf_event *group_leader; 716 struct perf_event *group_leader;
672 const struct pmu *pmu; 717 struct pmu *pmu;
673 718
674 enum perf_event_active_state state; 719 enum perf_event_active_state state;
675 unsigned int attach_state; 720 unsigned int attach_state;
@@ -743,7 +788,7 @@ struct perf_event {
743 int pending_wakeup; 788 int pending_wakeup;
744 int pending_kill; 789 int pending_kill;
745 int pending_disable; 790 int pending_disable;
746 struct perf_pending_entry pending; 791 struct irq_work pending;
747 792
748 atomic_t event_limit; 793 atomic_t event_limit;
749 794
@@ -763,12 +808,19 @@ struct perf_event {
763#endif /* CONFIG_PERF_EVENTS */ 808#endif /* CONFIG_PERF_EVENTS */
764}; 809};
765 810
811enum perf_event_context_type {
812 task_context,
813 cpu_context,
814};
815
766/** 816/**
767 * struct perf_event_context - event context structure 817 * struct perf_event_context - event context structure
768 * 818 *
769 * Used as a container for task events and CPU events as well: 819 * Used as a container for task events and CPU events as well:
770 */ 820 */
771struct perf_event_context { 821struct perf_event_context {
822 enum perf_event_context_type type;
823 struct pmu *pmu;
772 /* 824 /*
773 * Protect the states of the events in the list, 825 * Protect the states of the events in the list,
774 * nr_active, and the list: 826 * nr_active, and the list:
@@ -808,6 +860,12 @@ struct perf_event_context {
808 struct rcu_head rcu_head; 860 struct rcu_head rcu_head;
809}; 861};
810 862
863/*
864 * Number of contexts where an event can trigger:
865 * task, softirq, hardirq, nmi.
866 */
867#define PERF_NR_CONTEXTS 4
868
811/** 869/**
812 * struct perf_event_cpu_context - per cpu event context structure 870 * struct perf_event_cpu_context - per cpu event context structure
813 */ 871 */
@@ -815,18 +873,9 @@ struct perf_cpu_context {
815 struct perf_event_context ctx; 873 struct perf_event_context ctx;
816 struct perf_event_context *task_ctx; 874 struct perf_event_context *task_ctx;
817 int active_oncpu; 875 int active_oncpu;
818 int max_pertask;
819 int exclusive; 876 int exclusive;
820 struct swevent_hlist *swevent_hlist; 877 struct list_head rotation_list;
821 struct mutex hlist_mutex; 878 int jiffies_interval;
822 int hlist_refcount;
823
824 /*
825 * Recursion avoidance:
826 *
827 * task, softirq, irq, nmi context
828 */
829 int recursion[4];
830}; 879};
831 880
832struct perf_output_handle { 881struct perf_output_handle {
@@ -842,26 +891,34 @@ struct perf_output_handle {
842 891
843#ifdef CONFIG_PERF_EVENTS 892#ifdef CONFIG_PERF_EVENTS
844 893
845/* 894extern int perf_pmu_register(struct pmu *pmu);
846 * Set by architecture code: 895extern void perf_pmu_unregister(struct pmu *pmu);
847 */ 896
848extern int perf_max_events; 897extern int perf_num_counters(void);
898extern const char *perf_pmu_name(void);
899extern void __perf_event_task_sched_in(struct task_struct *task);
900extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
849 901
850extern const struct pmu *hw_perf_event_init(struct perf_event *event); 902extern atomic_t perf_task_events;
903
904static inline void perf_event_task_sched_in(struct task_struct *task)
905{
906 COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
907}
908
909static inline
910void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
911{
912 COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
913}
851 914
852extern void perf_event_task_sched_in(struct task_struct *task);
853extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
854extern void perf_event_task_tick(struct task_struct *task);
855extern int perf_event_init_task(struct task_struct *child); 915extern int perf_event_init_task(struct task_struct *child);
856extern void perf_event_exit_task(struct task_struct *child); 916extern void perf_event_exit_task(struct task_struct *child);
857extern void perf_event_free_task(struct task_struct *task); 917extern void perf_event_free_task(struct task_struct *task);
858extern void set_perf_event_pending(void); 918extern void perf_event_delayed_put(struct task_struct *task);
859extern void perf_event_do_pending(void);
860extern void perf_event_print_debug(void); 919extern void perf_event_print_debug(void);
861extern void __perf_disable(void); 920extern void perf_pmu_disable(struct pmu *pmu);
862extern bool __perf_enable(void); 921extern void perf_pmu_enable(struct pmu *pmu);
863extern void perf_disable(void);
864extern void perf_enable(void);
865extern int perf_event_task_disable(void); 922extern int perf_event_task_disable(void);
866extern int perf_event_task_enable(void); 923extern int perf_event_task_enable(void);
867extern void perf_event_update_userpage(struct perf_event *event); 924extern void perf_event_update_userpage(struct perf_event *event);
@@ -869,7 +926,7 @@ extern int perf_event_release_kernel(struct perf_event *event);
869extern struct perf_event * 926extern struct perf_event *
870perf_event_create_kernel_counter(struct perf_event_attr *attr, 927perf_event_create_kernel_counter(struct perf_event_attr *attr,
871 int cpu, 928 int cpu,
872 pid_t pid, 929 struct task_struct *task,
873 perf_overflow_handler_t callback); 930 perf_overflow_handler_t callback);
874extern u64 perf_event_read_value(struct perf_event *event, 931extern u64 perf_event_read_value(struct perf_event *event,
875 u64 *enabled, u64 *running); 932 u64 *enabled, u64 *running);
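
A hedged sketch of the updated in-kernel counter API: the caller now passes a task_struct pointer (or NULL for a CPU-bound counter) instead of a pid. The attribute setup and overflow handler below are invented for illustration:

#include <linux/perf_event.h>

static void example_overflow(struct perf_event *event, int nmi,
                             struct perf_sample_data *data,
                             struct pt_regs *regs)
{
        /* react to the counter crossing its sample period */
}

static struct perf_event *example_counter_for(struct task_struct *task)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .config         = PERF_COUNT_HW_INSTRUCTIONS,
                .size           = sizeof(attr),
                .sample_period  = 1000000,
        };

        /* cpu == -1: the counter follows the task across CPUs */
        return perf_event_create_kernel_counter(&attr, -1, task,
                                                example_overflow);
}
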
@@ -920,14 +977,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
920 */ 977 */
921static inline int is_software_event(struct perf_event *event) 978static inline int is_software_event(struct perf_event *event)
922{ 979{
923 switch (event->attr.type) { 980 return event->pmu->task_ctx_nr == perf_sw_context;
924 case PERF_TYPE_SOFTWARE:
925 case PERF_TYPE_TRACEPOINT:
926 /* for now the breakpoint stuff also works as software event */
927 case PERF_TYPE_BREAKPOINT:
928 return 1;
929 }
930 return 0;
931} 981}
932 982
933extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 983extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -954,18 +1004,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
954 perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); 1004 perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
955} 1005}
956 1006
957static inline void 1007static __always_inline void
958perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 1008perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
959{ 1009{
960 if (atomic_read(&perf_swevent_enabled[event_id])) { 1010 struct pt_regs hot_regs;
961 struct pt_regs hot_regs; 1011
962 1012 JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);
963 if (!regs) { 1013 return;
964 perf_fetch_caller_regs(&hot_regs); 1014
965 regs = &hot_regs; 1015have_event:
966 } 1016 if (!regs) {
967 __perf_sw_event(event_id, nr, nmi, regs, addr); 1017 perf_fetch_caller_regs(&hot_regs);
1018 regs = &hot_regs;
968 } 1019 }
1020 __perf_sw_event(event_id, nr, nmi, regs, addr);
969} 1021}
970 1022
971extern void perf_event_mmap(struct vm_area_struct *vma); 1023extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -976,7 +1028,21 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
976extern void perf_event_comm(struct task_struct *tsk); 1028extern void perf_event_comm(struct task_struct *tsk);
977extern void perf_event_fork(struct task_struct *tsk); 1029extern void perf_event_fork(struct task_struct *tsk);
978 1030
979extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); 1031/* Callchains */
1032DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
1033
1034extern void perf_callchain_user(struct perf_callchain_entry *entry,
1035 struct pt_regs *regs);
1036extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
1037 struct pt_regs *regs);
1038
1039
1040static inline void
1041perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
1042{
1043 if (entry->nr < PERF_MAX_STACK_DEPTH)
1044 entry->ip[entry->nr++] = ip;
1045}
980 1046
981extern int sysctl_perf_event_paranoid; 1047extern int sysctl_perf_event_paranoid;
982extern int sysctl_perf_event_mlock; 1048extern int sysctl_perf_event_mlock;
@@ -1019,21 +1085,18 @@ extern int perf_swevent_get_recursion_context(void);
1019extern void perf_swevent_put_recursion_context(int rctx); 1085extern void perf_swevent_put_recursion_context(int rctx);
1020extern void perf_event_enable(struct perf_event *event); 1086extern void perf_event_enable(struct perf_event *event);
1021extern void perf_event_disable(struct perf_event *event); 1087extern void perf_event_disable(struct perf_event *event);
1088extern void perf_event_task_tick(void);
1022#else 1089#else
1023static inline void 1090static inline void
1024perf_event_task_sched_in(struct task_struct *task) { } 1091perf_event_task_sched_in(struct task_struct *task) { }
1025static inline void 1092static inline void
1026perf_event_task_sched_out(struct task_struct *task, 1093perf_event_task_sched_out(struct task_struct *task,
1027 struct task_struct *next) { } 1094 struct task_struct *next) { }
1028static inline void
1029perf_event_task_tick(struct task_struct *task) { }
1030static inline int perf_event_init_task(struct task_struct *child) { return 0; } 1095static inline int perf_event_init_task(struct task_struct *child) { return 0; }
1031static inline void perf_event_exit_task(struct task_struct *child) { } 1096static inline void perf_event_exit_task(struct task_struct *child) { }
1032static inline void perf_event_free_task(struct task_struct *task) { } 1097static inline void perf_event_free_task(struct task_struct *task) { }
1033static inline void perf_event_do_pending(void) { } 1098static inline void perf_event_delayed_put(struct task_struct *task) { }
1034static inline void perf_event_print_debug(void) { } 1099static inline void perf_event_print_debug(void) { }
1035static inline void perf_disable(void) { }
1036static inline void perf_enable(void) { }
1037static inline int perf_event_task_disable(void) { return -EINVAL; } 1100static inline int perf_event_task_disable(void) { return -EINVAL; }
1038static inline int perf_event_task_enable(void) { return -EINVAL; } 1101static inline int perf_event_task_enable(void) { return -EINVAL; }
1039 1102
@@ -1056,6 +1119,7 @@ static inline int perf_swevent_get_recursion_context(void) { return -1; }
1056static inline void perf_swevent_put_recursion_context(int rctx) { } 1119static inline void perf_swevent_put_recursion_context(int rctx) { }
1057static inline void perf_event_enable(struct perf_event *event) { } 1120static inline void perf_event_enable(struct perf_event *event) { }
1058static inline void perf_event_disable(struct perf_event *event) { } 1121static inline void perf_event_disable(struct perf_event *event) { }
1122static inline void perf_event_task_tick(void) { }
1059#endif 1123#endif
1060 1124
1061#define perf_output_put(handle, x) \ 1125#define perf_output_put(handle, x) \
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 634b8e674ac5..a39cbed9ee17 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
47{ 47{
48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); 48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
49} 49}
50#define radix_tree_indirect_to_ptr(ptr) \
51 radix_tree_indirect_to_ptr((void __force *)(ptr))
50 52
51static inline int radix_tree_is_indirect_ptr(void *ptr) 53static inline int radix_tree_is_indirect_ptr(void *ptr)
52{ 54{
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
61struct radix_tree_root { 63struct radix_tree_root {
62 unsigned int height; 64 unsigned int height;
63 gfp_t gfp_mask; 65 gfp_t gfp_mask;
64 struct radix_tree_node *rnode; 66 struct radix_tree_node __rcu *rnode;
65}; 67};
66 68
67#define RADIX_TREE_INIT(mask) { \ 69#define RADIX_TREE_INIT(mask) { \
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4ec3b38ce9c5..f31ef61f1c65 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -10,6 +10,21 @@
10#include <linux/rcupdate.h> 10#include <linux/rcupdate.h>
11 11
12/* 12/*
13 * Why is there no list_empty_rcu()? Because list_empty() serves this
14 * purpose. The list_empty() function fetches the RCU-protected pointer
15 * and compares it to the address of the list head, but neither dereferences
16 * this pointer itself nor provides this pointer to the caller. Therefore,
17 * it is not necessary to use rcu_dereference(), so that list_empty() can
18 * be used anywhere you would want to use a list_empty_rcu().
19 */
20
21/*
22 * return the ->next pointer of a list_head in an rcu safe
23 * way, we must not access it directly
24 */
25#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
26
27/*
13 * Insert a new entry between two known consecutive entries. 28 * Insert a new entry between two known consecutive entries.
14 * 29 *
15 * This is only for internal list manipulation where we know 30 * This is only for internal list manipulation where we know
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
20{ 35{
21 new->next = next; 36 new->next = next;
22 new->prev = prev; 37 new->prev = prev;
23 rcu_assign_pointer(prev->next, new); 38 rcu_assign_pointer(list_next_rcu(prev), new);
24 next->prev = new; 39 next->prev = new;
25} 40}
26 41
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
138{ 153{
139 new->next = old->next; 154 new->next = old->next;
140 new->prev = old->prev; 155 new->prev = old->prev;
141 rcu_assign_pointer(new->prev->next, new); 156 rcu_assign_pointer(list_next_rcu(new->prev), new);
142 new->next->prev = new; 157 new->next->prev = new;
143 old->prev = LIST_POISON2; 158 old->prev = LIST_POISON2;
144} 159}
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
193 */ 208 */
194 209
195 last->next = at; 210 last->next = at;
196 rcu_assign_pointer(head->next, first); 211 rcu_assign_pointer(list_next_rcu(head), first);
197 first->prev = head; 212 first->prev = head;
198 at->prev = last; 213 at->prev = last;
199} 214}
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
208 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 223 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
209 */ 224 */
210#define list_entry_rcu(ptr, type, member) \ 225#define list_entry_rcu(ptr, type, member) \
211 container_of(rcu_dereference_raw(ptr), type, member) 226 ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
227 container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
228 })
212 229
213/** 230/**
214 * list_first_entry_rcu - get the first element from a list 231 * list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
225 list_entry_rcu((ptr)->next, type, member) 242 list_entry_rcu((ptr)->next, type, member)
226 243
227#define __list_for_each_rcu(pos, head) \ 244#define __list_for_each_rcu(pos, head) \
228 for (pos = rcu_dereference_raw((head)->next); \ 245 for (pos = rcu_dereference_raw(list_next_rcu(head)); \
229 pos != (head); \ 246 pos != (head); \
230 pos = rcu_dereference_raw(pos->next)) 247 pos = rcu_dereference_raw(list_next_rcu((pos)))
231 248
232/** 249/**
233 * list_for_each_entry_rcu - iterate over rcu list of given type 250 * list_for_each_entry_rcu - iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
257 * as long as the traversal is guarded by rcu_read_lock(). 274 * as long as the traversal is guarded by rcu_read_lock().
258 */ 275 */
259#define list_for_each_continue_rcu(pos, head) \ 276#define list_for_each_continue_rcu(pos, head) \
260 for ((pos) = rcu_dereference_raw((pos)->next); \ 277 for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
261 prefetch((pos)->next), (pos) != (head); \ 278 prefetch((pos)->next), (pos) != (head); \
262 (pos) = rcu_dereference_raw((pos)->next)) 279 (pos) = rcu_dereference_raw(list_next_rcu(pos)))
263 280
264/** 281/**
265 * list_for_each_entry_continue_rcu - continue iteration over list of given type 282 * list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
314 331
315 new->next = next; 332 new->next = next;
316 new->pprev = old->pprev; 333 new->pprev = old->pprev;
317 rcu_assign_pointer(*new->pprev, new); 334 rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
318 if (next) 335 if (next)
319 new->next->pprev = &new->next; 336 new->next->pprev = &new->next;
320 old->pprev = LIST_POISON2; 337 old->pprev = LIST_POISON2;
321} 338}
322 339
340/*
341 * return the first or the next element in an RCU protected hlist
342 */
343#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first)))
344#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next)))
345#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev)))
346
323/** 347/**
324 * hlist_add_head_rcu 348 * hlist_add_head_rcu
325 * @n: the element to add to the hash list. 349 * @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
346 370
347 n->next = first; 371 n->next = first;
348 n->pprev = &h->first; 372 n->pprev = &h->first;
349 rcu_assign_pointer(h->first, n); 373 rcu_assign_pointer(hlist_first_rcu(h), n);
350 if (first) 374 if (first)
351 first->pprev = &n->next; 375 first->pprev = &n->next;
352} 376}
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
374{ 398{
375 n->pprev = next->pprev; 399 n->pprev = next->pprev;
376 n->next = next; 400 n->next = next;
377 rcu_assign_pointer(*(n->pprev), n); 401 rcu_assign_pointer(hlist_pprev_rcu(n), n);
378 next->pprev = &n->next; 402 next->pprev = &n->next;
379} 403}
380 404
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
401{ 425{
402 n->next = prev->next; 426 n->next = prev->next;
403 n->pprev = &prev->next; 427 n->pprev = &prev->next;
404 rcu_assign_pointer(prev->next, n); 428 rcu_assign_pointer(hlist_next_rcu(prev), n);
405 if (n->next) 429 if (n->next)
406 n->next->pprev = &n->next; 430 n->next->pprev = &n->next;
407} 431}
408 432
409#define __hlist_for_each_rcu(pos, head) \ 433#define __hlist_for_each_rcu(pos, head) \
410 for (pos = rcu_dereference((head)->first); \ 434 for (pos = rcu_dereference(hlist_first_rcu(head)); \
411 pos && ({ prefetch(pos->next); 1; }); \ 435 pos && ({ prefetch(pos->next); 1; }); \
412 pos = rcu_dereference(pos->next)) 436 pos = rcu_dereference(hlist_next_rcu(pos)))
413 437
414/** 438/**
415 * hlist_for_each_entry_rcu - iterate over rcu list of given type 439 * hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
422 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 446 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
423 * as long as the traversal is guarded by rcu_read_lock(). 447 * as long as the traversal is guarded by rcu_read_lock().
424 */ 448 */
425#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ 449#define hlist_for_each_entry_rcu(tpos, pos, head, member) \
426 for (pos = rcu_dereference_raw((head)->first); \ 450 for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \
427 pos && ({ prefetch(pos->next); 1; }) && \ 451 pos && ({ prefetch(pos->next); 1; }) && \
428 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ 452 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
429 pos = rcu_dereference_raw(pos->next)) 453 pos = rcu_dereference_raw(hlist_next_rcu(pos)))
430 454
431/** 455/**
432 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type 456 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
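[Editor's note] These accessor macros do not change how the hlist API is used; they exist so sparse can check that the first/next/pprev pointers are reached through the __rcu address space. For context, a minimal sketch of typical usage follows — the struct, table, and lock names are hypothetical and not part of this patch:

#include <linux/rculist.h>
#include <linux/spinlock.h>

struct my_obj {
	int key;
	struct hlist_node hash_node;
};

static struct hlist_head my_table[64];		/* buckets, zero-initialized */
static DEFINE_SPINLOCK(my_table_lock);		/* serializes updaters only */

static void my_insert(struct my_obj *obj)
{
	spin_lock(&my_table_lock);
	hlist_add_head_rcu(&obj->hash_node, &my_table[obj->key & 63]);
	spin_unlock(&my_table_lock);
}

/*
 * Caller must hold rcu_read_lock(); the returned pointer stays valid
 * only while the caller remains in the RCU read-side critical section.
 */
static struct my_obj *my_lookup(int key)
{
	struct my_obj *obj;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(obj, node, &my_table[key & 63], hash_node)
		if (obj->key == key)
			return obj;
	return NULL;
}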
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index b70ffe53cb9f..2ae13714828b 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
37 } 37 }
38} 38}
39 39
40#define hlist_nulls_first_rcu(head) \
41 (*((struct hlist_nulls_node __rcu __force **)&(head)->first))
42
43#define hlist_nulls_next_rcu(node) \
44 (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
45
40/** 46/**
41 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization 47 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
42 * @n: the element to delete from the hash list. 48 * @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
88 94
89 n->next = first; 95 n->next = first;
90 n->pprev = &h->first; 96 n->pprev = &h->first;
91 rcu_assign_pointer(h->first, n); 97 rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
92 if (!is_a_nulls(first)) 98 if (!is_a_nulls(first))
93 first->pprev = &n->next; 99 first->pprev = &n->next;
94} 100}
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
100 * @member: the name of the hlist_nulls_node within the struct. 106 * @member: the name of the hlist_nulls_node within the struct.
101 * 107 *
102 */ 108 */
103#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ 109#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
104 for (pos = rcu_dereference_raw((head)->first); \ 110 for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
105 (!is_a_nulls(pos)) && \ 111 (!is_a_nulls(pos)) && \
106 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ 112 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
107 pos = rcu_dereference_raw(pos->next)) 113 pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
108 114
109#endif 115#endif
110#endif 116#endif
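[Editor's note] The nulls variant exists mainly for SLAB_DESTROY_BY_RCU-style hash tables, where an object can be freed and immediately reused on another chain while a reader is still walking it. A hedged sketch of the canonical lookup pattern, as outlined in Documentation/RCU/rculist_nulls.txt — the table, object, refcount, and put helper below are invented for illustration:

#include <linux/rculist_nulls.h>

#define MY_HASH_BITS	6
#define MY_HASH_SIZE	(1 << MY_HASH_BITS)

struct my_nobj {
	int key;
	atomic_t refcnt;
	struct hlist_nulls_node hash_node;
};

static struct hlist_nulls_head my_ntable[MY_HASH_SIZE];

/* Each bucket's nulls value encodes its slot number. */
static void my_ntable_init(void)
{
	int i;

	for (i = 0; i < MY_HASH_SIZE; i++)
		INIT_HLIST_NULLS_HEAD(&my_ntable[i], i);
}

static struct my_nobj *my_nlookup(int key)
{
	struct my_nobj *obj;
	struct hlist_nulls_node *node;
	unsigned int slot = key & (MY_HASH_SIZE - 1);

	rcu_read_lock();
begin:
	hlist_nulls_for_each_entry_rcu(obj, node, &my_ntable[slot], hash_node) {
		if (obj->key == key) {
			if (!atomic_inc_not_zero(&obj->refcnt))
				goto begin;		/* being freed, retry */
			if (obj->key != key) {		/* slab reused the object */
				my_nobj_put(obj);	/* hypothetical release helper */
				goto begin;
			}
			rcu_read_unlock();
			return obj;
		}
	}
	/* Ended on a chain with a different nulls value: wrong bucket, restart. */
	if (get_nulls_value(node) != slot)
		goto begin;
	rcu_read_unlock();
	return NULL;
}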
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 83af1f8d8b74..03cda7bed985 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,15 @@
41#include <linux/lockdep.h> 41#include <linux/lockdep.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/debugobjects.h> 43#include <linux/debugobjects.h>
44#include <linux/compiler.h>
44 45
45#ifdef CONFIG_RCU_TORTURE_TEST 46#ifdef CONFIG_RCU_TORTURE_TEST
46extern int rcutorture_runnable; /* for sysctl */ 47extern int rcutorture_runnable; /* for sysctl */
47#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
48 49
50#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52
49/** 53/**
50 * struct rcu_head - callback structure for use with RCU 54 * struct rcu_head - callback structure for use with RCU
51 * @next: next update requests in a list 55 * @next: next update requests in a list
@@ -57,29 +61,94 @@ struct rcu_head {
57}; 61};
58 62
59/* Exported common interfaces */ 63/* Exported common interfaces */
60extern void rcu_barrier(void); 64extern void call_rcu_sched(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu));
66extern void synchronize_sched(void);
61extern void rcu_barrier_bh(void); 67extern void rcu_barrier_bh(void);
62extern void rcu_barrier_sched(void); 68extern void rcu_barrier_sched(void);
63extern void synchronize_sched_expedited(void); 69extern void synchronize_sched_expedited(void);
64extern int sched_expedited_torture_stats(char *page); 70extern int sched_expedited_torture_stats(char *page);
65 71
72static inline void __rcu_read_lock_bh(void)
73{
74 local_bh_disable();
75}
76
77static inline void __rcu_read_unlock_bh(void)
78{
79 local_bh_enable();
80}
81
82#ifdef CONFIG_PREEMPT_RCU
83
84extern void __rcu_read_lock(void);
85extern void __rcu_read_unlock(void);
86void synchronize_rcu(void);
87
88/*
89 * Defined as a macro as it is a very low level header included from
90 * areas that don't even know about current. This gives the rcu_read_lock()
91 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
92 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
93 */
94#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
95
96#else /* #ifdef CONFIG_PREEMPT_RCU */
97
98static inline void __rcu_read_lock(void)
99{
100 preempt_disable();
101}
102
103static inline void __rcu_read_unlock(void)
104{
105 preempt_enable();
106}
107
108static inline void synchronize_rcu(void)
109{
110 synchronize_sched();
111}
112
113static inline int rcu_preempt_depth(void)
114{
115 return 0;
116}
117
118#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119
66/* Internal to kernel */ 120/* Internal to kernel */
67extern void rcu_init(void); 121extern void rcu_init(void);
122extern void rcu_sched_qs(int cpu);
123extern void rcu_bh_qs(int cpu);
124extern void rcu_check_callbacks(int cpu, int user);
125struct notifier_block;
126
127#ifdef CONFIG_NO_HZ
128
129extern void rcu_enter_nohz(void);
130extern void rcu_exit_nohz(void);
131
132#else /* #ifdef CONFIG_NO_HZ */
133
134static inline void rcu_enter_nohz(void)
135{
136}
137
138static inline void rcu_exit_nohz(void)
139{
140}
141
142#endif /* #else #ifdef CONFIG_NO_HZ */
68 143
69#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 144#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
70#include <linux/rcutree.h> 145#include <linux/rcutree.h>
71#elif defined(CONFIG_TINY_RCU) 146#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
72#include <linux/rcutiny.h> 147#include <linux/rcutiny.h>
73#else 148#else
74#error "Unknown RCU implementation specified to kernel configuration" 149#error "Unknown RCU implementation specified to kernel configuration"
75#endif 150#endif
76 151
77#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
78#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
79#define INIT_RCU_HEAD(ptr) do { \
80 (ptr)->next = NULL; (ptr)->func = NULL; \
81} while (0)
82
83/* 152/*
84 * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic 153 * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
85 * initialization and destruction of rcu_head on the stack. rcu_head structures 154 * initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
120extern int debug_lockdep_rcu_enabled(void); 189extern int debug_lockdep_rcu_enabled(void);
121 190
122/** 191/**
123 * rcu_read_lock_held - might we be in RCU read-side critical section? 192 * rcu_read_lock_held() - might we be in RCU read-side critical section?
124 * 193 *
125 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU 194 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
126 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, 195 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
127 * this assumes we are in an RCU read-side critical section unless it can 196 * this assumes we are in an RCU read-side critical section unless it can
128 * prove otherwise. 197 * prove otherwise. This is useful for debug checks in functions that
198 * require that they be called within an RCU read-side critical section.
129 * 199 *
130 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 200 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
131 * and while lockdep is disabled. 201 * and while lockdep is disabled.
132 */ 202 */
133static inline int rcu_read_lock_held(void) 203static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
144extern int rcu_read_lock_bh_held(void); 214extern int rcu_read_lock_bh_held(void);
145 215
146/** 216/**
147 * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? 217 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
148 * 218 *
149 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an 219 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
150 * RCU-sched read-side critical section. In absence of 220 * RCU-sched read-side critical section. In absence of
151 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side 221 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
152 * critical section unless it can prove otherwise. Note that disabling 222 * critical section unless it can prove otherwise. Note that disabling
153 * of preemption (including disabling irqs) counts as an RCU-sched 223 * of preemption (including disabling irqs) counts as an RCU-sched
154 * read-side critical section. 224 * read-side critical section. This is useful for debug checks in functions
225 * that required that they be called within an RCU-sched read-side
226 * critical section.
155 * 227 *
156 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 228 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
157 * and while lockdep is disabled. 229 * and while lockdep is disabled.
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
211 283
212extern int rcu_my_thread_group_empty(void); 284extern int rcu_my_thread_group_empty(void);
213 285
214#define __do_rcu_dereference_check(c) \ 286/**
287 * rcu_lockdep_assert - emit lockdep splat if specified condition not met
288 * @c: condition to check
289 */
290#define rcu_lockdep_assert(c) \
215 do { \ 291 do { \
216 static bool __warned; \ 292 static bool __warned; \
217 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ 293 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
220 } \ 296 } \
221 } while (0) 297 } while (0)
222 298
299#else /* #ifdef CONFIG_PROVE_RCU */
300
301#define rcu_lockdep_assert(c) do { } while (0)
302
303#endif /* #else #ifdef CONFIG_PROVE_RCU */
304
305/*
306 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
307 * and rcu_assign_pointer(). Some of these could be folded into their
308 * callers, but they are left separate in order to ease introduction of
309 * multiple flavors of pointers to match the multiple flavors of RCU
 310 * (e.g., __rcu_bh, __rcu_sched, and __srcu), should this make sense in
311 * the future.
312 */
313
314#ifdef __CHECKER__
315#define rcu_dereference_sparse(p, space) \
316 ((void)(((typeof(*p) space *)p) == p))
317#else /* #ifdef __CHECKER__ */
318#define rcu_dereference_sparse(p, space)
319#endif /* #else #ifdef __CHECKER__ */
320
321#define __rcu_access_pointer(p, space) \
322 ({ \
323 typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
324 rcu_dereference_sparse(p, space); \
325 ((typeof(*p) __force __kernel *)(_________p1)); \
326 })
327#define __rcu_dereference_check(p, c, space) \
328 ({ \
329 typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
330 rcu_lockdep_assert(c); \
331 rcu_dereference_sparse(p, space); \
332 smp_read_barrier_depends(); \
333 ((typeof(*p) __force __kernel *)(_________p1)); \
334 })
335#define __rcu_dereference_protected(p, c, space) \
336 ({ \
337 rcu_lockdep_assert(c); \
338 rcu_dereference_sparse(p, space); \
339 ((typeof(*p) __force __kernel *)(p)); \
340 })
341
342#define __rcu_dereference_index_check(p, c) \
343 ({ \
344 typeof(p) _________p1 = ACCESS_ONCE(p); \
345 rcu_lockdep_assert(c); \
346 smp_read_barrier_depends(); \
347 (_________p1); \
348 })
349#define __rcu_assign_pointer(p, v, space) \
350 ({ \
351 if (!__builtin_constant_p(v) || \
352 ((v) != NULL)) \
353 smp_wmb(); \
354 (p) = (typeof(*v) __force space *)(v); \
355 })
356
357
358/**
359 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
360 * @p: The pointer to read
361 *
362 * Return the value of the specified RCU-protected pointer, but omit the
363 * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful
364 * when the value of this pointer is accessed, but the pointer is not
365 * dereferenced, for example, when testing an RCU-protected pointer against
366 * NULL. Although rcu_access_pointer() may also be used in cases where
367 * update-side locks prevent the value of the pointer from changing, you
368 * should instead use rcu_dereference_protected() for this use case.
369 */
370#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
371
223/** 372/**
224 * rcu_dereference_check - rcu_dereference with debug checking 373 * rcu_dereference_check() - rcu_dereference with debug checking
225 * @p: The pointer to read, prior to dereferencing 374 * @p: The pointer to read, prior to dereferencing
226 * @c: The conditions under which the dereference will take place 375 * @c: The conditions under which the dereference will take place
227 * 376 *
228 * Do an rcu_dereference(), but check that the conditions under which the 377 * Do an rcu_dereference(), but check that the conditions under which the
229 * dereference will take place are correct. Typically the conditions indicate 378 * dereference will take place are correct. Typically the conditions
230 * the various locking conditions that should be held at that point. The check 379 * indicate the various locking conditions that should be held at that
231 * should return true if the conditions are satisfied. 380 * point. The check should return true if the conditions are satisfied.
381 * An implicit check for being in an RCU read-side critical section
382 * (rcu_read_lock()) is included.
232 * 383 *
233 * For example: 384 * For example:
234 * 385 *
235 * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || 386 * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
236 * lockdep_is_held(&foo->lock));
237 * 387 *
238 * could be used to indicate to lockdep that foo->bar may only be dereferenced 388 * could be used to indicate to lockdep that foo->bar may only be dereferenced
239 * if either the RCU read lock is held, or that the lock required to replace 389 * if either rcu_read_lock() is held, or that the lock required to replace
240 * the bar struct at foo->bar is held. 390 * the bar struct at foo->bar is held.
241 * 391 *
242 * Note that the list of conditions may also include indications of when a lock 392 * Note that the list of conditions may also include indications of when a lock
243 * need not be held, for example during initialisation or destruction of the 393 * need not be held, for example during initialisation or destruction of the
244 * target struct: 394 * target struct:
245 * 395 *
246 * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || 396 * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
247 * lockdep_is_held(&foo->lock) ||
248 * atomic_read(&foo->usage) == 0); 397 * atomic_read(&foo->usage) == 0);
398 *
399 * Inserts memory barriers on architectures that require them
400 * (currently only the Alpha), prevents the compiler from refetching
401 * (and from merging fetches), and, more importantly, documents exactly
402 * which pointers are protected by RCU and checks that the pointer is
403 * annotated as __rcu.
249 */ 404 */
250#define rcu_dereference_check(p, c) \ 405#define rcu_dereference_check(p, c) \
251 ({ \ 406 __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
252 __do_rcu_dereference_check(c); \ 407
253 rcu_dereference_raw(p); \ 408/**
254 }) 409 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
410 * @p: The pointer to read, prior to dereferencing
411 * @c: The conditions under which the dereference will take place
412 *
413 * This is the RCU-bh counterpart to rcu_dereference_check().
414 */
415#define rcu_dereference_bh_check(p, c) \
416 __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
255 417
256/** 418/**
257 * rcu_dereference_protected - fetch RCU pointer when updates prevented 419 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
420 * @p: The pointer to read, prior to dereferencing
421 * @c: The conditions under which the dereference will take place
422 *
423 * This is the RCU-sched counterpart to rcu_dereference_check().
424 */
425#define rcu_dereference_sched_check(p, c) \
426 __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
427 __rcu)
428
429#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
430
431/**
432 * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
433 * @p: The pointer to read, prior to dereferencing
434 * @c: The conditions under which the dereference will take place
435 *
436 * Similar to rcu_dereference_check(), but omits the sparse checking.
437 * This allows rcu_dereference_index_check() to be used on integers,
438 * which can then be used as array indices. Attempting to use
439 * rcu_dereference_check() on an integer will give compiler warnings
440 * because the sparse address-space mechanism relies on dereferencing
441 * the RCU-protected pointer. Dereferencing integers is not something
442 * that even gcc will put up with.
443 *
444 * Note that this function does not implicitly check for RCU read-side
445 * critical sections. If this function gains lots of uses, it might
446 * make sense to provide versions for each flavor of RCU, but it does
447 * not make sense as of early 2010.
448 */
449#define rcu_dereference_index_check(p, c) \
450 __rcu_dereference_index_check((p), (c))
451
452/**
453 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
454 * @p: The pointer to read, prior to dereferencing
455 * @c: The conditions under which the dereference will take place
258 * 456 *
259 * Return the value of the specified RCU-protected pointer, but omit 457 * Return the value of the specified RCU-protected pointer, but omit
260 * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This 458 * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
263 * prevent the compiler from repeating this reference or combining it 461 * prevent the compiler from repeating this reference or combining it
264 * with other references, so it should not be used without protection 462 * with other references, so it should not be used without protection
265 * of appropriate locks. 463 * of appropriate locks.
464 *
465 * This function is only for update-side use. Using this function
466 * when protected only by rcu_read_lock() will result in infrequent
467 * but very ugly failures.
266 */ 468 */
267#define rcu_dereference_protected(p, c) \ 469#define rcu_dereference_protected(p, c) \
268 ({ \ 470 __rcu_dereference_protected((p), (c), __rcu)
269 __do_rcu_dereference_check(c); \
270 (p); \
271 })
272 471
273#else /* #ifdef CONFIG_PROVE_RCU */ 472/**
473 * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
474 * @p: The pointer to read, prior to dereferencing
475 * @c: The conditions under which the dereference will take place
476 *
477 * This is the RCU-bh counterpart to rcu_dereference_protected().
478 */
479#define rcu_dereference_bh_protected(p, c) \
480 __rcu_dereference_protected((p), (c), __rcu)
274 481
275#define rcu_dereference_check(p, c) rcu_dereference_raw(p) 482/**
276#define rcu_dereference_protected(p, c) (p) 483 * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
484 * @p: The pointer to read, prior to dereferencing
485 * @c: The conditions under which the dereference will take place
486 *
487 * This is the RCU-sched counterpart to rcu_dereference_protected().
488 */
489#define rcu_dereference_sched_protected(p, c) \
490 __rcu_dereference_protected((p), (c), __rcu)
277 491
278#endif /* #else #ifdef CONFIG_PROVE_RCU */
279 492
280/** 493/**
281 * rcu_access_pointer - fetch RCU pointer with no dereferencing 494 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
495 * @p: The pointer to read, prior to dereferencing
282 * 496 *
283 * Return the value of the specified RCU-protected pointer, but omit the 497 * This is a simple wrapper around rcu_dereference_check().
284 * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful 498 */
285 * when the value of this pointer is accessed, but the pointer is not 499#define rcu_dereference(p) rcu_dereference_check(p, 0)
286 * dereferenced, for example, when testing an RCU-protected pointer against 500
287 * NULL. This may also be used in cases where update-side locks prevent 501/**
288 * the value of the pointer from changing, but rcu_dereference_protected() 502 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
289 * is a lighter-weight primitive for this use case. 503 * @p: The pointer to read, prior to dereferencing
504 *
505 * Makes rcu_dereference_check() do the dirty work.
506 */
507#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
508
509/**
510 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
511 * @p: The pointer to read, prior to dereferencing
512 *
513 * Makes rcu_dereference_check() do the dirty work.
290 */ 514 */
291#define rcu_access_pointer(p) ACCESS_ONCE(p) 515#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
292 516
293/** 517/**
294 * rcu_read_lock - mark the beginning of an RCU read-side critical section. 518 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
295 * 519 *
296 * When synchronize_rcu() is invoked on one CPU while other CPUs 520 * When synchronize_rcu() is invoked on one CPU while other CPUs
297 * are within RCU read-side critical sections, then the 521 * are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
302 * until after the all the other CPUs exit their critical sections. 526 * until after the all the other CPUs exit their critical sections.
303 * 527 *
304 * Note, however, that RCU callbacks are permitted to run concurrently 528 * Note, however, that RCU callbacks are permitted to run concurrently
305 * with RCU read-side critical sections. One way that this can happen 529 * with new RCU read-side critical sections. One way that this can happen
306 * is via the following sequence of events: (1) CPU 0 enters an RCU 530 * is via the following sequence of events: (1) CPU 0 enters an RCU
307 * read-side critical section, (2) CPU 1 invokes call_rcu() to register 531 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
308 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, 532 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
317 * will be deferred until the outermost RCU read-side critical section 541 * will be deferred until the outermost RCU read-side critical section
318 * completes. 542 * completes.
319 * 543 *
320 * It is illegal to block while in an RCU read-side critical section. 544 * You can avoid reading and understanding the next paragraph by
545 * following this rule: don't put anything in an rcu_read_lock() RCU
546 * read-side critical section that would block in a !PREEMPT kernel.
547 * But if you want the full story, read on!
548 *
549 * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
550 * is illegal to block while in an RCU read-side critical section. In
551 * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
552 * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
553 * be preempted, but explicit blocking is illegal. Finally, in preemptible
554 * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
555 * RCU read-side critical sections may be preempted and they may also
556 * block, but only when acquiring spinlocks that are subject to priority
557 * inheritance.
321 */ 558 */
322static inline void rcu_read_lock(void) 559static inline void rcu_read_lock(void)
323{ 560{
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
337 */ 574 */
338 575
339/** 576/**
340 * rcu_read_unlock - marks the end of an RCU read-side critical section. 577 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
341 * 578 *
342 * See rcu_read_lock() for more information. 579 * See rcu_read_lock() for more information.
343 */ 580 */
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
349} 586}
350 587
351/** 588/**
352 * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section 589 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
353 * 590 *
354 * This is equivalent of rcu_read_lock(), but to be used when updates 591 * This is equivalent of rcu_read_lock(), but to be used when updates
355 * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks 592 * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
356 * consider completion of a softirq handler to be a quiescent state, 593 * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
357 * a process in RCU read-side critical section must be protected by 594 * softirq handler to be a quiescent state, a process in RCU read-side
358 * disabling softirqs. Read-side critical sections in interrupt context 595 * critical section must be protected by disabling softirqs. Read-side
359 * can use just rcu_read_lock(). 596 * critical sections in interrupt context can use just rcu_read_lock(),
360 * 597 * though this should at least be commented to avoid confusing people
598 * reading the code.
361 */ 599 */
362static inline void rcu_read_lock_bh(void) 600static inline void rcu_read_lock_bh(void)
363{ 601{
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
379} 617}
380 618
381/** 619/**
382 * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section 620 * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
383 * 621 *
384 * Should be used with either 622 * This is equivalent of rcu_read_lock(), but to be used when updates
385 * - synchronize_sched() 623 * are being done using call_rcu_sched() or synchronize_rcu_sched().
386 * or 624 * Read-side critical sections can also be introduced by anything that
387 * - call_rcu_sched() and rcu_barrier_sched() 625 * disables preemption, including local_irq_disable() and friends.
388 * on the write-side to insure proper synchronization.
389 */ 626 */
390static inline void rcu_read_lock_sched(void) 627static inline void rcu_read_lock_sched(void)
391{ 628{
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
420 preempt_enable_notrace(); 657 preempt_enable_notrace();
421} 658}
422 659
423
424/** 660/**
425 * rcu_dereference_raw - fetch an RCU-protected pointer 661 * rcu_assign_pointer() - assign to RCU-protected pointer
662 * @p: pointer to assign to
663 * @v: value to assign (publish)
426 * 664 *
427 * The caller must be within some flavor of RCU read-side critical 665 * Assigns the specified value to the specified RCU-protected
428 * section, or must be otherwise preventing the pointer from changing, 666 * pointer, ensuring that any concurrent RCU readers will see
429 * for example, by holding an appropriate lock. This pointer may later 667 * any prior initialization. Returns the value assigned.
430 * be safely dereferenced. It is the caller's responsibility to have
431 * done the right thing, as this primitive does no checking of any kind.
432 *
433 * Inserts memory barriers on architectures that require them
434 * (currently only the Alpha), and, more importantly, documents
435 * exactly which pointers are protected by RCU.
436 */
437#define rcu_dereference_raw(p) ({ \
438 typeof(p) _________p1 = ACCESS_ONCE(p); \
439 smp_read_barrier_depends(); \
440 (_________p1); \
441 })
442
443/**
444 * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
445 *
446 * Makes rcu_dereference_check() do the dirty work.
447 */
448#define rcu_dereference(p) \
449 rcu_dereference_check(p, rcu_read_lock_held())
450
451/**
452 * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
453 *
454 * Makes rcu_dereference_check() do the dirty work.
455 */
456#define rcu_dereference_bh(p) \
457 rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
458
459/**
460 * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
461 *
462 * Makes rcu_dereference_check() do the dirty work.
463 */
464#define rcu_dereference_sched(p) \
465 rcu_dereference_check(p, rcu_read_lock_sched_held())
466
467/**
468 * rcu_assign_pointer - assign (publicize) a pointer to a newly
469 * initialized structure that will be dereferenced by RCU read-side
470 * critical sections. Returns the value assigned.
471 * 668 *
472 * Inserts memory barriers on architectures that require them 669 * Inserts memory barriers on architectures that require them
473 * (pretty much all of them other than x86), and also prevents 670 * (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
476 * call documents which pointers will be dereferenced by RCU read-side 673 * call documents which pointers will be dereferenced by RCU read-side
477 * code. 674 * code.
478 */ 675 */
479
480#define rcu_assign_pointer(p, v) \ 676#define rcu_assign_pointer(p, v) \
481 ({ \ 677 __rcu_assign_pointer((p), (v), __rcu)
482 if (!__builtin_constant_p(v) || \ 678
483 ((v) != NULL)) \ 679/**
484 smp_wmb(); \ 680 * RCU_INIT_POINTER() - initialize an RCU protected pointer
485 (p) = (v); \ 681 *
486 }) 682 * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
683 * splats.
684 */
685#define RCU_INIT_POINTER(p, v) \
686 p = (typeof(*v) __force __rcu *)(v)
487 687
488/* Infrastructure to implement the synchronize_() primitives. */ 688/* Infrastructure to implement the synchronize_() primitives. */
489 689
@@ -494,26 +694,37 @@ struct rcu_synchronize {
494 694
495extern void wakeme_after_rcu(struct rcu_head *head); 695extern void wakeme_after_rcu(struct rcu_head *head);
496 696
697#ifdef CONFIG_PREEMPT_RCU
698
497/** 699/**
498 * call_rcu - Queue an RCU callback for invocation after a grace period. 700 * call_rcu() - Queue an RCU callback for invocation after a grace period.
499 * @head: structure to be used for queueing the RCU updates. 701 * @head: structure to be used for queueing the RCU updates.
500 * @func: actual update function to be invoked after the grace period 702 * @func: actual callback function to be invoked after the grace period
501 * 703 *
502 * The update function will be invoked some time after a full grace 704 * The callback function will be invoked some time after a full grace
503 * period elapses, in other words after all currently executing RCU 705 * period elapses, in other words after all pre-existing RCU read-side
504 * read-side critical sections have completed. RCU read-side critical 706 * critical sections have completed. However, the callback function
707 * might well execute concurrently with RCU read-side critical sections
708 * that started after call_rcu() was invoked. RCU read-side critical
505 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 709 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
506 * and may be nested. 710 * and may be nested.
507 */ 711 */
508extern void call_rcu(struct rcu_head *head, 712extern void call_rcu(struct rcu_head *head,
509 void (*func)(struct rcu_head *head)); 713 void (*func)(struct rcu_head *head));
510 714
715#else /* #ifdef CONFIG_PREEMPT_RCU */
716
717/* In classic RCU, call_rcu() is just call_rcu_sched(). */
718#define call_rcu call_rcu_sched
719
720#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
721
511/** 722/**
512 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 723 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
513 * @head: structure to be used for queueing the RCU updates. 724 * @head: structure to be used for queueing the RCU updates.
514 * @func: actual update function to be invoked after the grace period 725 * @func: actual callback function to be invoked after the grace period
515 * 726 *
516 * The update function will be invoked some time after a full grace 727 * The callback function will be invoked some time after a full grace
517 * period elapses, in other words after all currently executing RCU 728 * period elapses, in other words after all currently executing RCU
518 * read-side critical sections have completed. call_rcu_bh() assumes 729 * read-side critical sections have completed. call_rcu_bh() assumes
519 * that the read-side critical sections end on completion of a softirq 730 * that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
566} 777}
567#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 778#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
568 779
569#ifndef CONFIG_PROVE_RCU
570#define __do_rcu_dereference_check(c) do { } while (0)
571#endif /* #ifdef CONFIG_PROVE_RCU */
572
573#define __rcu_dereference_index_check(p, c) \
574 ({ \
575 typeof(p) _________p1 = ACCESS_ONCE(p); \
576 __do_rcu_dereference_check(c); \
577 smp_read_barrier_depends(); \
578 (_________p1); \
579 })
580
581/**
582 * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
583 * @p: The pointer to read, prior to dereferencing
584 * @c: The conditions under which the dereference will take place
585 *
586 * Similar to rcu_dereference_check(), but omits the sparse checking.
587 * This allows rcu_dereference_index_check() to be used on integers,
588 * which can then be used as array indices. Attempting to use
589 * rcu_dereference_check() on an integer will give compiler warnings
590 * because the sparse address-space mechanism relies on dereferencing
591 * the RCU-protected pointer. Dereferencing integers is not something
592 * that even gcc will put up with.
593 *
594 * Note that this function does not implicitly check for RCU read-side
595 * critical sections. If this function gains lots of uses, it might
596 * make sense to provide versions for each flavor of RCU, but it does
597 * not make sense as of early 2010.
598 */
599#define rcu_dereference_index_check(p, c) \
600 __rcu_dereference_index_check((p), (c))
601
602#endif /* __LINUX_RCUPDATE_H */ 780#endif /* __LINUX_RCUPDATE_H */
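[Editor's note] Taken together, the pieces above — the __rcu annotation, rcu_assign_pointer(), the rcu_dereference*() family, and call_rcu() — compose into the usual publish/read/reclaim pattern. The following is a minimal sketch, not code from this patch; the struct, pointer, and lock names (my_cfg, cur_cfg, my_cfg_lock) are invented for illustration:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_cfg {
	int threshold;
	struct rcu_head rcu;
};

static struct my_cfg __rcu *cur_cfg;	/* sparse checks every access via __rcu */
static DEFINE_SPINLOCK(my_cfg_lock);	/* serializes updaters */

/* Reader: rcu_dereference() now carries an implicit rcu_read_lock_held() check. */
static int my_read_threshold(void)
{
	struct my_cfg *cfg;
	int val = -1;

	rcu_read_lock();
	cfg = rcu_dereference(cur_cfg);
	if (cfg)
		val = cfg->threshold;
	rcu_read_unlock();
	return val;
}

static void my_cfg_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct my_cfg, rcu));
}

/* Updater: holds my_cfg_lock, so rcu_dereference_protected() may omit the
 * barriers and simply document (and lockdep-check) the locking requirement. */
static int my_set_threshold(int threshold)
{
	struct my_cfg *new, *old;

	new = kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->threshold = threshold;

	spin_lock(&my_cfg_lock);
	old = rcu_dereference_protected(cur_cfg, lockdep_is_held(&my_cfg_lock));
	rcu_assign_pointer(cur_cfg, new);
	spin_unlock(&my_cfg_lock);

	if (old)
		call_rcu(&old->rcu, my_cfg_free_rcu);
	return 0;
}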
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e2e893144a84..13877cb93a60 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,103 +27,101 @@
27 27
28#include <linux/cache.h> 28#include <linux/cache.h>
29 29
30void rcu_sched_qs(int cpu); 30#define rcu_init_sched() do { } while (0)
31void rcu_bh_qs(int cpu);
32static inline void rcu_note_context_switch(int cpu)
33{
34 rcu_sched_qs(cpu);
35}
36 31
37#define __rcu_read_lock() preempt_disable() 32#ifdef CONFIG_TINY_RCU
38#define __rcu_read_unlock() preempt_enable()
39#define __rcu_read_lock_bh() local_bh_disable()
40#define __rcu_read_unlock_bh() local_bh_enable()
41#define call_rcu_sched call_rcu
42 33
43#define rcu_init_sched() do { } while (0) 34static inline void synchronize_rcu_expedited(void)
44extern void rcu_check_callbacks(int cpu, int user); 35{
36 synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */
37}
45 38
46static inline int rcu_needs_cpu(int cpu) 39static inline void rcu_barrier(void)
47{ 40{
48 return 0; 41 rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */
49} 42}
50 43
51/* 44#else /* #ifdef CONFIG_TINY_RCU */
52 * Return the number of grace periods. 45
53 */ 46void rcu_barrier(void);
54static inline long rcu_batches_completed(void) 47void synchronize_rcu_expedited(void);
48
49#endif /* #else #ifdef CONFIG_TINY_RCU */
50
51static inline void synchronize_rcu_bh(void)
55{ 52{
56 return 0; 53 synchronize_sched();
57} 54}
58 55
59/* 56static inline void synchronize_rcu_bh_expedited(void)
60 * Return the number of bottom-half grace periods.
61 */
62static inline long rcu_batches_completed_bh(void)
63{ 57{
64 return 0; 58 synchronize_sched();
65} 59}
66 60
67static inline void rcu_force_quiescent_state(void) 61#ifdef CONFIG_TINY_RCU
62
63static inline void rcu_preempt_note_context_switch(void)
68{ 64{
69} 65}
70 66
71static inline void rcu_bh_force_quiescent_state(void) 67static inline void exit_rcu(void)
72{ 68{
73} 69}
74 70
75static inline void rcu_sched_force_quiescent_state(void) 71static inline int rcu_needs_cpu(int cpu)
76{ 72{
73 return 0;
77} 74}
78 75
79extern void synchronize_sched(void); 76#else /* #ifdef CONFIG_TINY_RCU */
77
78void rcu_preempt_note_context_switch(void);
79extern void exit_rcu(void);
80int rcu_preempt_needs_cpu(void);
80 81
81static inline void synchronize_rcu(void) 82static inline int rcu_needs_cpu(int cpu)
82{ 83{
83 synchronize_sched(); 84 return rcu_preempt_needs_cpu();
84} 85}
85 86
86static inline void synchronize_rcu_bh(void) 87#endif /* #else #ifdef CONFIG_TINY_RCU */
88
89static inline void rcu_note_context_switch(int cpu)
87{ 90{
88 synchronize_sched(); 91 rcu_sched_qs(cpu);
92 rcu_preempt_note_context_switch();
89} 93}
90 94
91static inline void synchronize_rcu_expedited(void) 95/*
96 * Return the number of grace periods.
97 */
98static inline long rcu_batches_completed(void)
92{ 99{
93 synchronize_sched(); 100 return 0;
94} 101}
95 102
96static inline void synchronize_rcu_bh_expedited(void) 103/*
104 * Return the number of bottom-half grace periods.
105 */
106static inline long rcu_batches_completed_bh(void)
97{ 107{
98 synchronize_sched(); 108 return 0;
99} 109}
100 110
101struct notifier_block; 111static inline void rcu_force_quiescent_state(void)
102
103#ifdef CONFIG_NO_HZ
104
105extern void rcu_enter_nohz(void);
106extern void rcu_exit_nohz(void);
107
108#else /* #ifdef CONFIG_NO_HZ */
109
110static inline void rcu_enter_nohz(void)
111{ 112{
112} 113}
113 114
114static inline void rcu_exit_nohz(void) 115static inline void rcu_bh_force_quiescent_state(void)
115{ 116{
116} 117}
117 118
118#endif /* #else #ifdef CONFIG_NO_HZ */ 119static inline void rcu_sched_force_quiescent_state(void)
119
120static inline void exit_rcu(void)
121{ 120{
122} 121}
123 122
124static inline int rcu_preempt_depth(void) 123static inline void rcu_cpu_stall_reset(void)
125{ 124{
126 return 0;
127} 125}
128 126
129#ifdef CONFIG_DEBUG_LOCK_ALLOC 127#ifdef CONFIG_DEBUG_LOCK_ALLOC
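[Editor's note] rcu_barrier() is what module teardown paths use to drain pending call_rcu() callbacks; under TINY_RCU it now simply maps onto rcu_barrier_sched() as shown above, so the same idiom works on every implementation. A hedged sketch with hypothetical helpers:

static void __exit my_module_exit(void)
{
	my_stop_queueing_callbacks();	/* hypothetical: no new call_rcu() after this */
	rcu_barrier();			/* wait until every queued callback has run */
	kfree(my_private_state);	/* hypothetical state touched by those callbacks */
}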
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c0ed1c056f29..95518e628794 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,23 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33struct notifier_block;
34
35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu);
37extern void rcu_note_context_switch(int cpu); 33extern void rcu_note_context_switch(int cpu);
38extern int rcu_needs_cpu(int cpu); 34extern int rcu_needs_cpu(int cpu);
35extern void rcu_cpu_stall_reset(void);
39 36
40#ifdef CONFIG_TREE_PREEMPT_RCU 37#ifdef CONFIG_TREE_PREEMPT_RCU
41 38
42extern void __rcu_read_lock(void);
43extern void __rcu_read_unlock(void);
44extern void synchronize_rcu(void);
45extern void exit_rcu(void); 39extern void exit_rcu(void);
46 40
47/*
48 * Defined as macro as it is a very low level header
49 * included from areas that don't even know about current
50 */
51#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
52
53#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 41#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
54 42
55static inline void __rcu_read_lock(void)
56{
57 preempt_disable();
58}
59
60static inline void __rcu_read_unlock(void)
61{
62 preempt_enable();
63}
64
65#define synchronize_rcu synchronize_sched
66
67static inline void exit_rcu(void) 43static inline void exit_rcu(void)
68{ 44{
69} 45}
70 46
71static inline int rcu_preempt_depth(void)
72{
73 return 0;
74}
75
76#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 47#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
77 48
78static inline void __rcu_read_lock_bh(void)
79{
80 local_bh_disable();
81}
82static inline void __rcu_read_unlock_bh(void)
83{
84 local_bh_enable();
85}
86
87extern void call_rcu_sched(struct rcu_head *head,
88 void (*func)(struct rcu_head *rcu));
89extern void synchronize_rcu_bh(void); 49extern void synchronize_rcu_bh(void);
90extern void synchronize_sched(void);
91extern void synchronize_rcu_expedited(void); 50extern void synchronize_rcu_expedited(void);
92 51
93static inline void synchronize_rcu_bh_expedited(void) 52static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void)
95 synchronize_sched_expedited(); 54 synchronize_sched_expedited();
96} 55}
97 56
98extern void rcu_check_callbacks(int cpu, int user); 57extern void rcu_barrier(void);
99 58
100extern long rcu_batches_completed(void); 59extern long rcu_batches_completed(void);
101extern long rcu_batches_completed_bh(void); 60extern long rcu_batches_completed_bh(void);
@@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void);
104extern void rcu_bh_force_quiescent_state(void); 63extern void rcu_bh_force_quiescent_state(void);
105extern void rcu_sched_force_quiescent_state(void); 64extern void rcu_sched_force_quiescent_state(void);
106 65
107#ifdef CONFIG_NO_HZ
108void rcu_enter_nohz(void);
109void rcu_exit_nohz(void);
110#else /* CONFIG_NO_HZ */
111static inline void rcu_enter_nohz(void)
112{
113}
114static inline void rcu_exit_nohz(void)
115{
116}
117#endif /* CONFIG_NO_HZ */
118
119/* A context switch is a grace period for RCU-sched and RCU-bh. */ 66/* A context switch is a grace period for RCU-sched and RCU-bh. */
120static inline int rcu_blocking_is_gp(void) 67static inline int rcu_blocking_is_gp(void)
121{ 68{
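[Editor's note] With the preemption-model-independent pieces moved into rcupdate.h, rcutree.h keeps only the tree-specific entry points. For orientation, a minimal RCU-sched reader/updater pairing might look like the sketch below; the names are illustrative and the single-updater assumption is stated in the comments:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
	int val;
};

static struct my_data __rcu *gp;

static int my_read(void)
{
	struct my_data *p;
	int val = 0;

	rcu_read_lock_sched();			/* preempt_disable() on !PREEMPT_RCU builds */
	p = rcu_dereference_sched(gp);
	if (p)
		val = p->val;
	rcu_read_unlock_sched();
	return val;
}

/* Updater: unpublish, wait for all preempt-disabled readers, then free. */
static void my_retire(void)
{
	struct my_data *old;

	old = rcu_dereference_protected(gp, 1);	/* sole updater in this sketch */
	rcu_assign_pointer(gp, NULL);
	synchronize_sched();
	kfree(old);
}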
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db2d7dd..0383601a927c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
875 SD_LV_NONE = 0, 875 SD_LV_NONE = 0,
876 SD_LV_SIBLING, 876 SD_LV_SIBLING,
877 SD_LV_MC, 877 SD_LV_MC,
878 SD_LV_BOOK,
878 SD_LV_CPU, 879 SD_LV_CPU,
879 SD_LV_NODE, 880 SD_LV_NODE,
880 SD_LV_ALLNODES, 881 SD_LV_ALLNODES,
@@ -1160,6 +1161,13 @@ struct sched_rt_entity {
1160 1161
1161struct rcu_node; 1162struct rcu_node;
1162 1163
1164enum perf_event_task_context {
1165 perf_invalid_context = -1,
1166 perf_hw_context = 0,
1167 perf_sw_context,
1168 perf_nr_task_contexts,
1169};
1170
1163struct task_struct { 1171struct task_struct {
1164 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1172 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1165 void *stack; 1173 void *stack;
@@ -1202,11 +1210,13 @@ struct task_struct {
1202 unsigned int policy; 1210 unsigned int policy;
1203 cpumask_t cpus_allowed; 1211 cpumask_t cpus_allowed;
1204 1212
1205#ifdef CONFIG_TREE_PREEMPT_RCU 1213#ifdef CONFIG_PREEMPT_RCU
1206 int rcu_read_lock_nesting; 1214 int rcu_read_lock_nesting;
1207 char rcu_read_unlock_special; 1215 char rcu_read_unlock_special;
1208 struct rcu_node *rcu_blocked_node;
1209 struct list_head rcu_node_entry; 1216 struct list_head rcu_node_entry;
1217#endif /* #ifdef CONFIG_PREEMPT_RCU */
1218#ifdef CONFIG_TREE_PREEMPT_RCU
1219 struct rcu_node *rcu_blocked_node;
1210#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1211 1221
1212#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1222#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1288,9 +1298,9 @@ struct task_struct {
1288 struct list_head cpu_timers[3]; 1298 struct list_head cpu_timers[3];
1289 1299
1290/* process credentials */ 1300/* process credentials */
1291 const struct cred *real_cred; /* objective and real subjective task 1301 const struct cred __rcu *real_cred; /* objective and real subjective task
1292 * credentials (COW) */ 1302 * credentials (COW) */
1293 const struct cred *cred; /* effective (overridable) subjective task 1303 const struct cred __rcu *cred; /* effective (overridable) subjective task
1294 * credentials (COW) */ 1304 * credentials (COW) */
1295 struct mutex cred_guard_mutex; /* guard against foreign influences on 1305 struct mutex cred_guard_mutex; /* guard against foreign influences on
1296 * credential calculations 1306 * credential calculations
@@ -1418,7 +1428,7 @@ struct task_struct {
1418#endif 1428#endif
1419#ifdef CONFIG_CGROUPS 1429#ifdef CONFIG_CGROUPS
1420 /* Control Group info protected by css_set_lock */ 1430 /* Control Group info protected by css_set_lock */
1421 struct css_set *cgroups; 1431 struct css_set __rcu *cgroups;
1422 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1432 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1423 struct list_head cg_list; 1433 struct list_head cg_list;
1424#endif 1434#endif
@@ -1431,7 +1441,7 @@ struct task_struct {
1431 struct futex_pi_state *pi_state_cache; 1441 struct futex_pi_state *pi_state_cache;
1432#endif 1442#endif
1433#ifdef CONFIG_PERF_EVENTS 1443#ifdef CONFIG_PERF_EVENTS
1434 struct perf_event_context *perf_event_ctxp; 1444 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1435 struct mutex perf_event_mutex; 1445 struct mutex perf_event_mutex;
1436 struct list_head perf_event_list; 1446 struct list_head perf_event_list;
1437#endif 1447#endif
@@ -1681,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1681/* 1691/*
1682 * Per process flags 1692 * Per process flags
1683 */ 1693 */
1684#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1694#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
1685 /* Not implemented yet, only for 486*/
1686#define PF_STARTING 0x00000002 /* being created */ 1695#define PF_STARTING 0x00000002 /* being created */
1687#define PF_EXITING 0x00000004 /* getting shut down */ 1696#define PF_EXITING 0x00000004 /* getting shut down */
1688#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1697#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1740,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1740#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1749#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1741#define used_math() tsk_used_math(current) 1750#define used_math() tsk_used_math(current)
1742 1751
1743#ifdef CONFIG_TREE_PREEMPT_RCU 1752#ifdef CONFIG_PREEMPT_RCU
1744 1753
1745#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1754#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1746#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1755#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@ -1749,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p)
1749{ 1758{
1750 p->rcu_read_lock_nesting = 0; 1759 p->rcu_read_lock_nesting = 0;
1751 p->rcu_read_unlock_special = 0; 1760 p->rcu_read_unlock_special = 0;
1761#ifdef CONFIG_TREE_PREEMPT_RCU
1752 p->rcu_blocked_node = NULL; 1762 p->rcu_blocked_node = NULL;
1763#endif
1753 INIT_LIST_HEAD(&p->rcu_node_entry); 1764 INIT_LIST_HEAD(&p->rcu_node_entry);
1754} 1765}
1755 1766
@@ -1826,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
1826extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1837extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1827#endif 1838#endif
1828 1839
1840#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1841/*
1842 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
1843 * The reason for this explicit opt-in is not to have perf penalty with
1844 * slow sched_clocks.
1845 */
1846extern void enable_sched_clock_irqtime(void);
1847extern void disable_sched_clock_irqtime(void);
1848#else
1849static inline void enable_sched_clock_irqtime(void) {}
1850static inline void disable_sched_clock_irqtime(void) {}
1851#endif
1852
1829extern unsigned long long 1853extern unsigned long long
1830task_sched_runtime(struct task_struct *task); 1854task_sched_runtime(struct task_struct *task);
1831extern unsigned long long thread_group_sched_runtime(struct task_struct *task); 1855extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@ -2367,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
2367 2391
2368extern int __cond_resched_softirq(void); 2392extern int __cond_resched_softirq(void);
2369 2393
2370#define cond_resched_softirq() ({ \ 2394#define cond_resched_softirq() ({ \
2371 __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ 2395 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2372 __cond_resched_softirq(); \ 2396 __cond_resched_softirq(); \
2373}) 2397})
2374 2398
2375/* 2399/*
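[Editor's note] A hedged sketch of how the new __rcu annotations on task_struct fields are consumed: reads must go through rcu_dereference(), so sparse can flag a plain dereference of tsk->real_cred. In the real tree this is what helpers such as __task_cred()/task_uid() in <linux/cred.h> do; the function below is an invented stand-in, not the kernel's implementation:

#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/rcupdate.h>

static uid_t my_task_uid(struct task_struct *tsk)
{
	const struct cred *cred;
	uid_t uid;

	rcu_read_lock();
	cred = rcu_dereference(tsk->real_cred);	/* __rcu pointer, checked access */
	uid = cred->uid;
	rcu_read_unlock();
	return uid;
}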
diff --git a/include/linux/security.h b/include/linux/security.h
index a22219afff09..b8246a8df7d2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot,
74extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); 74extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
75extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, 75extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
76 unsigned long arg4, unsigned long arg5); 76 unsigned long arg4, unsigned long arg5);
77extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); 77extern int cap_task_setscheduler(struct task_struct *p);
78extern int cap_task_setioprio(struct task_struct *p, int ioprio); 78extern int cap_task_setioprio(struct task_struct *p, int ioprio);
79extern int cap_task_setnice(struct task_struct *p, int nice); 79extern int cap_task_setnice(struct task_struct *p, int nice);
80extern int cap_syslog(int type, bool from_file); 80extern int cap_syslog(int type, bool from_file);
@@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
959 * Sets the new child socket's sid to the openreq sid. 959 * Sets the new child socket's sid to the openreq sid.
960 * @inet_conn_established: 960 * @inet_conn_established:
961 * Sets the connection's peersid to the secmark on skb. 961 * Sets the connection's peersid to the secmark on skb.
962 * @secmark_relabel_packet:
 963 * Check if the process should be allowed to relabel packets to the given secid.
 964 * @secmark_refcount_inc:
 965 * Tells the LSM to increment the number of secmark labeling rules loaded.
 966 * @secmark_refcount_dec:
 967 * Tells the LSM to decrement the number of secmark labeling rules loaded.
962 * @req_classify_flow: 968 * @req_classify_flow:
963 * Sets the flow's sid to the openreq sid. 969 * Sets the flow's sid to the openreq sid.
964 * @tun_dev_create: 970 * @tun_dev_create:
@@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
1279 * Return 0 if permission is granted. 1285 * Return 0 if permission is granted.
1280 * 1286 *
1281 * @secid_to_secctx: 1287 * @secid_to_secctx:
1282 * Convert secid to security context. 1288 * Convert secid to security context. If secdata is NULL the length of
1289 * the result will be returned in seclen, but no secdata will be returned.
1290 * This does mean that the length could change between calls to check the
1291 * length and the next call which actually allocates and returns the secdata.
1283 * @secid contains the security ID. 1292 * @secid contains the security ID.
1284 * @secdata contains the pointer that stores the converted security context. 1293 * @secdata contains the pointer that stores the converted security context.
 1294 * @seclen contains the pointer that stores the length of the data.
1285 * @secctx_to_secid: 1295 * @secctx_to_secid:
1286 * Convert security context to secid. 1296 * Convert security context to secid.
1287 * @secid contains the pointer to the generated security ID. 1297 * @secid contains the pointer to the generated security ID.
@@ -1501,8 +1511,7 @@ struct security_operations {
1501 int (*task_getioprio) (struct task_struct *p); 1511 int (*task_getioprio) (struct task_struct *p);
1502 int (*task_setrlimit) (struct task_struct *p, unsigned int resource, 1512 int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
1503 struct rlimit *new_rlim); 1513 struct rlimit *new_rlim);
1504 int (*task_setscheduler) (struct task_struct *p, int policy, 1514 int (*task_setscheduler) (struct task_struct *p);
1505 struct sched_param *lp);
1506 int (*task_getscheduler) (struct task_struct *p); 1515 int (*task_getscheduler) (struct task_struct *p);
1507 int (*task_movememory) (struct task_struct *p); 1516 int (*task_movememory) (struct task_struct *p);
1508 int (*task_kill) (struct task_struct *p, 1517 int (*task_kill) (struct task_struct *p,
@@ -1594,6 +1603,9 @@ struct security_operations {
1594 struct request_sock *req); 1603 struct request_sock *req);
1595 void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); 1604 void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
1596 void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); 1605 void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
1606 int (*secmark_relabel_packet) (u32 secid);
1607 void (*secmark_refcount_inc) (void);
1608 void (*secmark_refcount_dec) (void);
1597 void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); 1609 void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
1598 int (*tun_dev_create)(void); 1610 int (*tun_dev_create)(void);
1599 void (*tun_dev_post_create)(struct sock *sk); 1611 void (*tun_dev_post_create)(struct sock *sk);
@@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio);
1752int security_task_getioprio(struct task_struct *p); 1764int security_task_getioprio(struct task_struct *p);
1753int security_task_setrlimit(struct task_struct *p, unsigned int resource, 1765int security_task_setrlimit(struct task_struct *p, unsigned int resource,
1754 struct rlimit *new_rlim); 1766 struct rlimit *new_rlim);
1755int security_task_setscheduler(struct task_struct *p, 1767int security_task_setscheduler(struct task_struct *p);
1756 int policy, struct sched_param *lp);
1757int security_task_getscheduler(struct task_struct *p); 1768int security_task_getscheduler(struct task_struct *p);
1758int security_task_movememory(struct task_struct *p); 1769int security_task_movememory(struct task_struct *p);
1759int security_task_kill(struct task_struct *p, struct siginfo *info, 1770int security_task_kill(struct task_struct *p, struct siginfo *info,
@@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p,
2320 return 0; 2331 return 0;
2321} 2332}
2322 2333
2323static inline int security_task_setscheduler(struct task_struct *p, 2334static inline int security_task_setscheduler(struct task_struct *p)
2324 int policy,
2325 struct sched_param *lp)
2326{ 2335{
2327 return cap_task_setscheduler(p, policy, lp); 2336 return cap_task_setscheduler(p);
2328} 2337}
2329 2338
2330static inline int security_task_getscheduler(struct task_struct *p) 2339static inline int security_task_getscheduler(struct task_struct *p)
@@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk,
2551 const struct request_sock *req); 2560 const struct request_sock *req);
2552void security_inet_conn_established(struct sock *sk, 2561void security_inet_conn_established(struct sock *sk,
2553 struct sk_buff *skb); 2562 struct sk_buff *skb);
2563int security_secmark_relabel_packet(u32 secid);
2564void security_secmark_refcount_inc(void);
2565void security_secmark_refcount_dec(void);
2554int security_tun_dev_create(void); 2566int security_tun_dev_create(void);
2555void security_tun_dev_post_create(struct sock *sk); 2567void security_tun_dev_post_create(struct sock *sk);
2556int security_tun_dev_attach(struct sock *sk); 2568int security_tun_dev_attach(struct sock *sk);
@@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk,
2705{ 2717{
2706} 2718}
2707 2719
2720static inline int security_secmark_relabel_packet(u32 secid)
2721{
2722 return 0;
2723}
2724
2725static inline void security_secmark_refcount_inc(void)
2726{
2727}
2728
2729static inline void security_secmark_refcount_dec(void)
2730{
2731}
2732
2708static inline int security_tun_dev_create(void) 2733static inline int security_tun_dev_create(void)
2709{ 2734{
2710 return 0; 2735 return 0;
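The security.h hunks above drop the now-unused policy/sched_param arguments from the task_setscheduler hook and add three LSM hooks for SECMARK. A minimal sketch of how a netfilter SECMARK-style target could drive the new hooks; the rule structure and function names are hypothetical, only the security_secmark_*() calls come from the hunks above.

#include <linux/security.h>

/* Hypothetical target state; illustrative only. */
struct secmark_rule {
	u32 secid;
};

static int secmark_rule_check(struct secmark_rule *rule)
{
	int err;

	/* May the current task relabel packets to this security ID? */
	err = security_secmark_relabel_packet(rule->secid);
	if (err)
		return err;

	/* Tell the LSM that a SECMARK rule is now in use. */
	security_secmark_refcount_inc();
	return 0;
}

static void secmark_rule_destroy(struct secmark_rule *rule)
{
	/* Rule removed or flushed: drop the use count again. */
	security_secmark_refcount_dec();
}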
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 82e0f26a1299..44f459612690 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -21,74 +21,11 @@ struct kern_ipc_perm;
21#ifdef CONFIG_SECURITY_SELINUX 21#ifdef CONFIG_SECURITY_SELINUX
22 22
23/** 23/**
24 * selinux_string_to_sid - map a security context string to a security ID
25 * @str: the security context string to be mapped
26 * @sid: ID value returned via this.
27 *
28 * Returns 0 if successful, with the SID stored in sid. A value
29 * of zero for sid indicates no SID could be determined (but no error
30 * occurred).
31 */
32int selinux_string_to_sid(char *str, u32 *sid);
33
34/**
35 * selinux_secmark_relabel_packet_permission - secmark permission check
36 * @sid: SECMARK ID value to be applied to network packet
37 *
38 * Returns 0 if the current task is allowed to set the SECMARK label of
39 * packets with the supplied security ID. Note that it is implicit that
40 * the packet is always being relabeled from the default unlabeled value,
41 * and that the access control decision is made in the AVC.
42 */
43int selinux_secmark_relabel_packet_permission(u32 sid);
44
45/**
46 * selinux_secmark_refcount_inc - increments the secmark use counter
47 *
48 * SELinux keeps track of the current SECMARK targets in use so it knows
49 * when to apply SECMARK label access checks to network packets. This
 50 * function increments this reference count to indicate that a new SECMARK
51 * target has been configured.
52 */
53void selinux_secmark_refcount_inc(void);
54
55/**
56 * selinux_secmark_refcount_dec - decrements the secmark use counter
57 *
58 * SELinux keeps track of the current SECMARK targets in use so it knows
59 * when to apply SECMARK label access checks to network packets. This
 60 * function decrements this reference count to indicate that one of the
61 * existing SECMARK targets has been removed/flushed.
62 */
63void selinux_secmark_refcount_dec(void);
64
65/**
66 * selinux_is_enabled - is SELinux enabled? 24 * selinux_is_enabled - is SELinux enabled?
67 */ 25 */
68bool selinux_is_enabled(void); 26bool selinux_is_enabled(void);
69#else 27#else
70 28
71static inline int selinux_string_to_sid(const char *str, u32 *sid)
72{
73 *sid = 0;
74 return 0;
75}
76
77static inline int selinux_secmark_relabel_packet_permission(u32 sid)
78{
79 return 0;
80}
81
82static inline void selinux_secmark_refcount_inc(void)
83{
84 return;
85}
86
87static inline void selinux_secmark_refcount_dec(void)
88{
89 return;
90}
91
92static inline bool selinux_is_enabled(void) 29static inline bool selinux_is_enabled(void)
93{ 30{
94 return false; 31 return false;
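The helpers removed above are replaced by the LSM-generic hooks added to security.h earlier in this diff. Roughly, a caller migrating off the SELinux-specific API switches as sketched below (err and sid are placeholders):

	/* Before (SELinux-specific, removed above): */
	err = selinux_secmark_relabel_packet_permission(sid);
	selinux_secmark_refcount_inc();

	/* After (LSM-generic, see the security.h hunks above): */
	err = security_secmark_relabel_packet(sid);
	security_secmark_refcount_inc();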
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4d5d2f546dbf..58971e891f48 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
108#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 108#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
109 109
110/** 110/**
111 * srcu_dereference - fetch SRCU-protected pointer with checking 111 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
112 * @p: the pointer to fetch and protect for later dereferencing
113 * @sp: pointer to the srcu_struct, which is used to check that we
114 * really are in an SRCU read-side critical section.
115 * @c: condition to check for update-side use
112 * 116 *
113 * Makes rcu_dereference_check() do the dirty work. 117 * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
118 * critical section will result in an RCU-lockdep splat, unless @c evaluates
119 * to 1. The @c argument will normally be a logical expression containing
120 * lockdep_is_held() calls.
114 */ 121 */
115#define srcu_dereference(p, sp) \ 122#define srcu_dereference_check(p, sp, c) \
116 rcu_dereference_check(p, srcu_read_lock_held(sp)) 123 __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
124
125/**
126 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
127 * @p: the pointer to fetch and protect for later dereferencing
128 * @sp: pointer to the srcu_struct, which is used to check that we
129 * really are in an SRCU read-side critical section.
130 *
131 * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU
132 * is enabled, invoking this outside of an RCU read-side critical
133 * section will result in an RCU-lockdep splat.
134 */
135#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
117 136
118/** 137/**
119 * srcu_read_lock - register a new reader for an SRCU-protected structure. 138 * srcu_read_lock - register a new reader for an SRCU-protected structure.
120 * @sp: srcu_struct in which to register the new reader. 139 * @sp: srcu_struct in which to register the new reader.
121 * 140 *
122 * Enter an SRCU read-side critical section. Note that SRCU read-side 141 * Enter an SRCU read-side critical section. Note that SRCU read-side
123 * critical sections may be nested. 142 * critical sections may be nested. However, it is illegal to
143 * call anything that waits on an SRCU grace period for the same
144 * srcu_struct, whether directly or indirectly. Please note that
145 * one way to indirectly wait on an SRCU grace period is to acquire
146 * a mutex that is held elsewhere while calling synchronize_srcu() or
147 * synchronize_srcu_expedited().
124 */ 148 */
125static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) 149static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
126{ 150{
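srcu_dereference_check() lets update-side code that holds the right lock, but no SRCU read lock, fetch the pointer without triggering an RCU-lockdep splat. A minimal sketch, assuming a hypothetical foo structure protected by foo_srcu for readers and foo_mutex for updaters (init_srcu_struct(&foo_srcu) is assumed to have been called during setup):

#include <linux/srcu.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>

struct foo {
	int val;
};

static struct foo __rcu *global_foo;
static struct srcu_struct foo_srcu;	/* init_srcu_struct(&foo_srcu) at setup */
static DEFINE_MUTEX(foo_mutex);

static int read_foo_val(void)
{
	struct foo *p;
	int idx, val = -1;

	idx = srcu_read_lock(&foo_srcu);
	/* Inside the read-side critical section: plain srcu_dereference(). */
	p = srcu_dereference(global_foo, &foo_srcu);
	if (p)
		val = p->val;
	srcu_read_unlock(&foo_srcu, idx);
	return val;
}

static int peek_foo_val_locked(void)
{
	struct foo *p;

	/* Update side: no SRCU read lock, but foo_mutex is held, so say so. */
	p = srcu_dereference_check(global_foo, &foo_srcu,
				   lockdep_is_held(&foo_mutex));
	return p ? p->val : -1;
}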
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 6b524a0d02e4..1808960c5059 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -126,8 +126,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
126 126
127#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ 127#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
128 128
129static inline int stop_machine(int (*fn)(void *), void *data, 129static inline int __stop_machine(int (*fn)(void *), void *data,
130 const struct cpumask *cpus) 130 const struct cpumask *cpus)
131{ 131{
132 int ret; 132 int ret;
133 local_irq_disable(); 133 local_irq_disable();
@@ -136,5 +136,11 @@ static inline int stop_machine(int (*fn)(void *), void *data,
136 return ret; 136 return ret;
137} 137}
138 138
139static inline int stop_machine(int (*fn)(void *), void *data,
140 const struct cpumask *cpus)
141{
142 return __stop_machine(fn, data, cpus);
143}
144
139#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ 145#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
140#endif /* _LINUX_STOP_MACHINE */ 146#endif /* _LINUX_STOP_MACHINE */
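With this change the !SMP stubs mirror the SMP API: __stop_machine() does the work (interrupts disabled around the callback) and stop_machine() simply wraps it, so callers of either symbol keep building. A minimal caller sketch with a made-up callback:

#include <linux/stop_machine.h>
#include <linux/errno.h>

/* Runs with interrupts disabled; on SMP, every other CPU spins meanwhile. */
static int apply_update(void *data)
{
	int *new_value = data;

	if (*new_value < 0)
		return -EINVAL;
	/* ... update state that must never be observed half-changed ... */
	return 0;
}

static int do_update(int value)
{
	/* NULL cpumask: let stop_machine pick where the callback runs. */
	return stop_machine(apply_update, &value, NULL);
}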
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 671538d25bc1..8eee9dbbfe7a 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
69 enum rpc_gss_proc gc_proc; 69 enum rpc_gss_proc gc_proc;
70 u32 gc_seq; 70 u32 gc_seq;
71 spinlock_t gc_seq_lock; 71 spinlock_t gc_seq_lock;
72 struct gss_ctx *gc_gss_ctx; 72 struct gss_ctx __rcu *gc_gss_ctx;
73 struct xdr_netobj gc_wire_ctx; 73 struct xdr_netobj gc_wire_ctx;
74 u32 gc_win; 74 u32 gc_win;
75 unsigned long gc_expiry; 75 unsigned long gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
80struct gss_cred { 80struct gss_cred {
81 struct rpc_cred gc_base; 81 struct rpc_cred gc_base;
82 enum rpc_gss_svc gc_service; 82 enum rpc_gss_svc gc_service;
83 struct gss_cl_ctx *gc_ctx; 83 struct gss_cl_ctx __rcu *gc_ctx;
84 struct gss_upcall_msg *gc_upcall; 84 struct gss_upcall_msg *gc_upcall;
85 unsigned long gc_upcall_timestamp; 85 unsigned long gc_upcall_timestamp;
86 unsigned char gc_machine_cred : 1; 86 unsigned char gc_machine_cred : 1;
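The new __rcu annotations on gc_gss_ctx and gc_ctx only affect sparse checking (CONFIG_SPARSE_RCU_POINTER): plain dereferences are flagged, and accesses are expected to go through the RCU accessors. A generic sketch of the pattern with hypothetical names:

#include <linux/rcupdate.h>

struct ctx {
	int id;
};

struct holder {
	struct ctx __rcu *ctx;		/* sparse flags plain dereferences of this */
};

static int holder_ctx_id(struct holder *h)
{
	struct ctx *c;
	int id = -1;

	rcu_read_lock();
	c = rcu_dereference(h->ctx);	/* strips the __rcu address space for sparse */
	if (c)
		id = c->id;
	rcu_read_unlock();
	return id;
}

static void holder_set_ctx(struct holder *h, struct ctx *new)
{
	rcu_assign_pointer(h->ctx, new);	/* publish with the required barrier */
}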
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index a8cc4e13434c..c90696544176 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -23,12 +23,12 @@ struct restart_block {
23 }; 23 };
24 /* For futex_wait and futex_wait_requeue_pi */ 24 /* For futex_wait and futex_wait_requeue_pi */
25 struct { 25 struct {
26 u32 *uaddr; 26 u32 __user *uaddr;
27 u32 val; 27 u32 val;
28 u32 flags; 28 u32 flags;
29 u32 bitset; 29 u32 bitset;
30 u64 time; 30 u64 time;
31 u32 *uaddr2; 31 u32 __user *uaddr2;
32 } futex; 32 } futex;
33 /* For nanosleep */ 33 /* For nanosleep */
34 struct { 34 struct {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 64e084ff5e5c..b91a40e847d2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
201 .balance_interval = 64, \ 201 .balance_interval = 64, \
202} 202}
203 203
204#ifdef CONFIG_SCHED_BOOK
205#ifndef SD_BOOK_INIT
206#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
207#endif
208#endif /* CONFIG_SCHED_BOOK */
209
204#ifdef CONFIG_NUMA 210#ifdef CONFIG_NUMA
205#ifndef SD_NODE_INIT 211#ifndef SD_NODE_INIT
206#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 212#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 103d1b61aacb..a4a90b6726ce 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/types.h> 18#include <linux/types.h>
19#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
20#include <linux/jump_label.h>
20 21
21struct module; 22struct module;
22struct tracepoint; 23struct tracepoint;
@@ -145,7 +146,9 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
145 extern struct tracepoint __tracepoint_##name; \ 146 extern struct tracepoint __tracepoint_##name; \
146 static inline void trace_##name(proto) \ 147 static inline void trace_##name(proto) \
147 { \ 148 { \
148 if (unlikely(__tracepoint_##name.state)) \ 149 JUMP_LABEL(&__tracepoint_##name.state, do_trace); \
150 return; \
151do_trace: \
149 __DO_TRACE(&__tracepoint_##name, \ 152 __DO_TRACE(&__tracepoint_##name, \
150 TP_PROTO(data_proto), \ 153 TP_PROTO(data_proto), \
151 TP_ARGS(data_args)); \ 154 TP_ARGS(data_args)); \
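With JUMP_LABEL, the disabled-tracepoint path in trace_##name() becomes a patchable no-op rather than a load and conditional branch on __tracepoint_##name.state; nothing changes for tracepoint users. A hedged sketch of the unchanged usage pattern, with a made-up tracepoint name:

#include <linux/tracepoint.h>

/* Hypothetical tracepoint, declared in a header shared by its callers ... */
DECLARE_TRACE(foo_update,
	TP_PROTO(int value),
	TP_ARGS(value));

/* ... and instantiated in exactly one .c file. */
DEFINE_TRACE(foo_update);

static void update_foo(int value)
{
	/*
	 * Compiles to a single nop while no probe is registered; enabling the
	 * tracepoint patches the nop into a jump to the do_trace path above.
	 */
	trace_foo_update(value);
}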
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index ef6c24a529e1..a4dc5b027bd9 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
51 return 0; 51 return 0;
52 52
53 rcu_read_lock(); 53 rcu_read_lock();
54 id = rcu_dereference(net_cls_subsys_id); 54 id = rcu_dereference_index_check(net_cls_subsys_id,
55 rcu_read_lock_held());
55 if (id >= 0) 56 if (id >= 0)
56 classid = container_of(task_subsys_state(p, id), 57 classid = container_of(task_subsys_state(p, id),
57 struct cgroup_cls_state, css)->classid; 58 struct cgroup_cls_state, css)->classid;
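rcu_dereference_index_check() is the accessor for RCU-protected values used as array indices rather than dereferenced as pointers; its second argument lists the conditions under which the access is legitimate, much like the @c argument of srcu_dereference_check() above. A small sketch with invented names:

#include <linux/rcupdate.h>

static int active_slot = -1;		/* updated under RCU publication rules */
static void *slot_table[16];

static void *current_slot_object(void)
{
	void *obj = NULL;
	int idx;

	rcu_read_lock();
	idx = rcu_dereference_index_check(active_slot, rcu_read_lock_held());
	if (idx >= 0)
		obj = slot_table[idx];
	rcu_read_unlock();
	return obj;
}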
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae54fa4..caf17db87dbc 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
75/* nf_conn feature for connections that have a helper */ 75/* nf_conn feature for connections that have a helper */
76struct nf_conn_help { 76struct nf_conn_help {
77 /* Helper. if any */ 77 /* Helper. if any */
78 struct nf_conntrack_helper *helper; 78 struct nf_conntrack_helper __rcu *helper;
79 79
80 union nf_conntrack_help help; 80 union nf_conntrack_help help;
81 81
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 0e4cfb694fe7..6fa7cbab7d93 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -5,7 +5,9 @@
5#define _TRACE_IRQ_H 5#define _TRACE_IRQ_H
6 6
7#include <linux/tracepoint.h> 7#include <linux/tracepoint.h>
8#include <linux/interrupt.h> 8
9struct irqaction;
10struct softirq_action;
9 11
10#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } 12#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
11#define show_softirq_name(val) \ 13#define show_softirq_name(val) \
@@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq,
93 ), 95 ),
94 96
95 TP_fast_assign( 97 TP_fast_assign(
96 __entry->vec = (int)(h - vec); 98 if (vec)
99 __entry->vec = (int)(h - vec);
100 else
101 __entry->vec = (int)(long)h;
97 ), 102 ),
98 103
99 TP_printk("vec=%d [action=%s]", __entry->vec, 104 TP_printk("vec=%d [action=%s]", __entry->vec,
@@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit,
136 TP_ARGS(h, vec) 141 TP_ARGS(h, vec)
137); 142);
138 143
144/**
145 * softirq_raise - called immediately when a softirq is raised
146 * @h: pointer to struct softirq_action
147 * @vec: pointer to first struct softirq_action in softirq_vec array
148 *
 149 * The @h parameter identifies the softirq vector being raised: when @vec is
 150 * NULL, @h carries the vector number itself rather than a pointer to a
 151 * struct softirq_action. When combined with the softirq_entry tracepoint,
 152 * the softirq raise-to-entry latency can be determined.
153 */
154DEFINE_EVENT(softirq, softirq_raise,
155
156 TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
157
158 TP_ARGS(h, vec)
159);
160
139#endif /* _TRACE_IRQ_H */ 161#endif /* _TRACE_IRQ_H */
140 162
141/* This part must be outside protection */ 163/* This part must be outside protection */
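Per the comment above, softirq_raise reuses the softirq event class but passes the vector number in @h and NULL for @vec, so TP_fast_assign stores (int)(long)h directly. A hedged sketch of what a raise-site call looks like under that convention (the wrapper name is invented):

#include <trace/events/irq.h>

static inline void note_softirq_raise(unsigned int nr)
{
	/* NULL @vec: @h carries the vector number itself, not a pointer. */
	trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
}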
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index 188deca2f3c7..8fe1e93f531d 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -6,10 +6,31 @@
6 6
7#include <linux/netdevice.h> 7#include <linux/netdevice.h>
8#include <linux/tracepoint.h> 8#include <linux/tracepoint.h>
9#include <linux/ftrace.h>
10
11#define NO_DEV "(no_device)"
12
13TRACE_EVENT(napi_poll,
9 14
10DECLARE_TRACE(napi_poll,
11 TP_PROTO(struct napi_struct *napi), 15 TP_PROTO(struct napi_struct *napi),
12 TP_ARGS(napi)); 16
17 TP_ARGS(napi),
18
19 TP_STRUCT__entry(
20 __field( struct napi_struct *, napi)
21 __string( dev_name, napi->dev ? napi->dev->name : NO_DEV)
22 ),
23
24 TP_fast_assign(
25 __entry->napi = napi;
26 __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
27 ),
28
29 TP_printk("napi poll on napi struct %p for device %s",
30 __entry->napi, __get_str(dev_name))
31);
32
33#undef NO_DEV
13 34
14#endif /* _TRACE_NAPI_H_ */ 35#endif /* _TRACE_NAPI_H_ */
15 36
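Converting napi_poll from a bare DECLARE_TRACE to a full TRACE_EVENT gives it an ftrace-visible format recording the napi pointer and the device name, falling back to "(no_device)" when napi->dev is NULL. The call form is unchanged; a hedged sketch of a poll routine firing it (in-tree the call actually sits in the core NAPI path, not in drivers):

#include <linux/netdevice.h>
#include <trace/events/napi.h>

/* Hypothetical poll routine; illustrates the call form only. */
static int foo_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	trace_napi_poll(napi);		/* records napi pointer and device name */
	/* ... process up to @budget packets, counting them in done ... */
	if (done < budget)
		napi_complete(napi);
	return done;
}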
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
new file mode 100644
index 000000000000..5f247f5ffc56
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,82 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM net
3
4#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_NET_H
6
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <linux/ip.h>
10#include <linux/tracepoint.h>
11
12TRACE_EVENT(net_dev_xmit,
13
14 TP_PROTO(struct sk_buff *skb,
15 int rc),
16
17 TP_ARGS(skb, rc),
18
19 TP_STRUCT__entry(
20 __field( void *, skbaddr )
21 __field( unsigned int, len )
22 __field( int, rc )
23 __string( name, skb->dev->name )
24 ),
25
26 TP_fast_assign(
27 __entry->skbaddr = skb;
28 __entry->len = skb->len;
29 __entry->rc = rc;
30 __assign_str(name, skb->dev->name);
31 ),
32
33 TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
34 __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
35);
36
37DECLARE_EVENT_CLASS(net_dev_template,
38
39 TP_PROTO(struct sk_buff *skb),
40
41 TP_ARGS(skb),
42
43 TP_STRUCT__entry(
44 __field( void *, skbaddr )
45 __field( unsigned int, len )
46 __string( name, skb->dev->name )
47 ),
48
49 TP_fast_assign(
50 __entry->skbaddr = skb;
51 __entry->len = skb->len;
52 __assign_str(name, skb->dev->name);
53 ),
54
55 TP_printk("dev=%s skbaddr=%p len=%u",
56 __get_str(name), __entry->skbaddr, __entry->len)
57)
58
59DEFINE_EVENT(net_dev_template, net_dev_queue,
60
61 TP_PROTO(struct sk_buff *skb),
62
63 TP_ARGS(skb)
64);
65
66DEFINE_EVENT(net_dev_template, netif_receive_skb,
67
68 TP_PROTO(struct sk_buff *skb),
69
70 TP_ARGS(skb)
71);
72
73DEFINE_EVENT(net_dev_template, netif_rx,
74
75 TP_PROTO(struct sk_buff *skb),
76
77 TP_ARGS(skb)
78);
79#endif /* _TRACE_NET_H */
80
81/* This part must be outside protection */
82#include <trace/define_trace.h>
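The new header defines one standalone event, net_dev_xmit, which also records the transmit return code, plus a class shared by net_dev_queue, netif_receive_skb and netif_rx that logs only the skb, its length and the device name. A hedged sketch of how a transmit path might fire the xmit event (the wrapper is invented; skb->dev is assumed to be set, since the event reads skb->dev->name):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <trace/events/net.h>

static int xmit_and_trace(struct sk_buff *skb, struct net_device *dev)
{
	int rc;

	rc = dev->netdev_ops->ndo_start_xmit(skb, dev);
	trace_net_dev_xmit(skb, rc);	/* logs dev name, skb address, len and rc */
	return rc;
}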
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 35a2a6e7bf1e..286784d69b8f 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -10,12 +10,17 @@
10#ifndef _TRACE_POWER_ENUM_ 10#ifndef _TRACE_POWER_ENUM_
11#define _TRACE_POWER_ENUM_ 11#define _TRACE_POWER_ENUM_
12enum { 12enum {
13 POWER_NONE = 0, 13 POWER_NONE = 0,
14 POWER_CSTATE = 1, 14 POWER_CSTATE = 1, /* C-State */
 15 POWER_PSTATE = 2, 15 POWER_PSTATE = 2, /* Frequency change or DVFS */
16 POWER_SSTATE = 3, /* Suspend */
16}; 17};
17#endif 18#endif
18 19
20/*
21 * The power events are used for cpuidle & suspend (power_start, power_end)
22 * and for cpufreq (power_frequency)
23 */
19DECLARE_EVENT_CLASS(power, 24DECLARE_EVENT_CLASS(power,
20 25
21 TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id), 26 TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
@@ -70,6 +75,85 @@ TRACE_EVENT(power_end,
70 75
71); 76);
72 77
78/*
79 * The clock events are used for clock enable/disable and for
80 * clock rate change
81 */
82DECLARE_EVENT_CLASS(clock,
83
84 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
85
86 TP_ARGS(name, state, cpu_id),
87
88 TP_STRUCT__entry(
89 __string( name, name )
90 __field( u64, state )
91 __field( u64, cpu_id )
92 ),
93
94 TP_fast_assign(
95 __assign_str(name, name);
96 __entry->state = state;
97 __entry->cpu_id = cpu_id;
98 ),
99
100 TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
101 (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
102);
103
104DEFINE_EVENT(clock, clock_enable,
105
106 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
107
108 TP_ARGS(name, state, cpu_id)
109);
110
111DEFINE_EVENT(clock, clock_disable,
112
113 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
114
115 TP_ARGS(name, state, cpu_id)
116);
117
118DEFINE_EVENT(clock, clock_set_rate,
119
120 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
121
122 TP_ARGS(name, state, cpu_id)
123);
124
125/*
126 * The power domain events are used for power domains transitions
127 */
128DECLARE_EVENT_CLASS(power_domain,
129
130 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
131
132 TP_ARGS(name, state, cpu_id),
133
134 TP_STRUCT__entry(
135 __string( name, name )
136 __field( u64, state )
137 __field( u64, cpu_id )
138 ),
139
140 TP_fast_assign(
141 __assign_str(name, name);
142 __entry->state = state;
143 __entry->cpu_id = cpu_id;
144),
145
146 TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
147 (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
148);
149
150DEFINE_EVENT(power_domain, power_domain_target,
151
152 TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
153
154 TP_ARGS(name, state, cpu_id)
155);
156
73#endif /* _TRACE_POWER_H */ 157#endif /* _TRACE_POWER_H */
74 158
75/* This part must be outside protection */ 159/* This part must be outside protection */
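The clock and power_domain classes share the (name, state, cpu_id) signature; @state is whatever the caller wants logged, e.g. 1/0 for enable/disable, the rate for clock_set_rate, or the target state for power_domain_target. A hedged sketch from a hypothetical clock backend (only the trace_* calls come from the hunks above):

#include <linux/smp.h>
#include <trace/events/power.h>

static void foo_clk_enable(const char *clk_name)
{
	trace_clock_enable(clk_name, 1, raw_smp_processor_id());
	/* ... ungate the clock ... */
}

static int foo_clk_set_rate(const char *clk_name, unsigned long rate)
{
	/* The prototype takes an unsigned int, so very large rates truncate. */
	trace_clock_set_rate(clk_name, (unsigned int)rate, raw_smp_processor_id());
	/* ... program the divider/PLL ... */
	return 0;
}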
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9208c92aeab5..f6334782a593 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
362 (unsigned long long)__entry->vruntime) 362 (unsigned long long)__entry->vruntime)
363); 363);
364 364
365/*
 366 * Tracepoint for showing priority inheritance modifying a task's
367 * priority.
368 */
369TRACE_EVENT(sched_pi_setprio,
370
371 TP_PROTO(struct task_struct *tsk, int newprio),
372
373 TP_ARGS(tsk, newprio),
374
375 TP_STRUCT__entry(
376 __array( char, comm, TASK_COMM_LEN )
377 __field( pid_t, pid )
378 __field( int, oldprio )
379 __field( int, newprio )
380 ),
381
382 TP_fast_assign(
383 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
384 __entry->pid = tsk->pid;
385 __entry->oldprio = tsk->prio;
386 __entry->newprio = newprio;
387 ),
388
389 TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
390 __entry->comm, __entry->pid,
391 __entry->oldprio, __entry->newprio)
392);
393
365#endif /* _TRACE_SCHED_H */ 394#endif /* _TRACE_SCHED_H */
366 395
367/* This part must be outside protection */ 396/* This part must be outside protection */
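sched_pi_setprio records a task's priority before and after a priority-inheritance boost; oldprio is sampled from tsk->prio in TP_fast_assign, so the event should fire before the new priority is written. A hedged sketch of the call shape (the real caller is the rt-mutex boosting code; the helper below is illustrative only):

#include <linux/sched.h>
#include <trace/events/sched.h>

static void note_prio_boost(struct task_struct *p, int newprio)
{
	trace_sched_pi_setprio(p, newprio);	/* oldprio is read from p->prio */
	/* The actual priority change goes through the scheduler, not a raw store. */
}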
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 4b2be6dc76f0..75ce9d500d8e 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb,
35 __entry->skbaddr, __entry->protocol, __entry->location) 35 __entry->skbaddr, __entry->protocol, __entry->location)
36); 36);
37 37
38TRACE_EVENT(consume_skb,
39
40 TP_PROTO(struct sk_buff *skb),
41
42 TP_ARGS(skb),
43
44 TP_STRUCT__entry(
45 __field( void *, skbaddr )
46 ),
47
48 TP_fast_assign(
49 __entry->skbaddr = skb;
50 ),
51
52 TP_printk("skbaddr=%p", __entry->skbaddr)
53);
54
38TRACE_EVENT(skb_copy_datagram_iovec, 55TRACE_EVENT(skb_copy_datagram_iovec,
39 56
40 TP_PROTO(const struct sk_buff *skb, int len), 57 TP_PROTO(const struct sk_buff *skb, int len),