1 files changed, 307 insertions, 0 deletions
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
new file mode 100644
index 000000000000..bda6ead69bd0
--- /dev/null
+++ b/Documentation/RCU/listRCU.txt
@@ -0,0 +1,307 @@
+Using RCU to Protect Read-Mostly Linked Lists
+One of the best applications of RCU is to protect read-mostly linked lists
+("struct list_head" in list.h).  One big advantage of this approach
+is that all of the required memory barriers are included for you in
+the list macros.  This document describes several applications of RCU,
+with the best fits first.
+Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
+The best applications are cases where, if reader-writer locking were
+used, the read-side lock would be dropped before taking any action
+based on the results of the search.  The most celebrated example is
+the routing table.  Because the routing table is tracking the state of
+equipment outside of the computer, it will at times contain stale data.
+Therefore, once the route has been computed, there is no need to hold
+the routing table static during transmission of the packet.  After all,
+you can hold the routing table static all you want, but that won't keep
+the external Internet from changing, and it is the state of the external
+Internet that really matters.  In addition, routing entries are typically
+added or deleted, rather than being modified in place.
+A straightforward example of this use of RCU may be found in the
+system-call auditing support.  For example, a reader-writer locked
+implementation of audit_filter_task() might be as follows:
+        static enum audit_state audit_filter_task(struct task_struct *tsk)
+        {
+                struct audit_entry *e;
+                enum audit_state   state;
+                read_lock(&auditsc_lock);
+                list_for_each_entry(e, &audit_tsklist, list) {
+                        if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+                                read_unlock(&auditsc_lock);
+                                return state;
+                        }
+                }
+                read_unlock(&auditsc_lock);
+                return AUDIT_BUILD_CONTEXT;
+        }
+Here the list is searched under the lock, but the lock is dropped before
+the corresponding value is returned.  By the time that this value is acted
+on, the list may well have been modified.  This makes sense, since if
+you are turning auditing off, it is OK to audit a few extra system calls.
+This means that RCU can be easily applied to the read side, as follows:
+        static enum audit_state audit_filter_task(struct task_struct *tsk)
+        {
+                struct audit_entry *e;
+                enum audit_state   state;
+                rcu_read_lock();
+                list_for_each_entry_rcu(e, &audit_tsklist, list) {
+                        if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+                                rcu_read_unlock();
+                                return state;
+                        }
+                }
+                rcu_read_unlock();
+                return AUDIT_BUILD_CONTEXT;
+        }
+The read_lock() and read_unlock() calls have become rcu_read_lock()
+and rcu_read_unlock(), respectively, and the list_for_each_entry() has
+become list_for_each_entry_rcu().  The _rcu() list-traversal primitives
+insert the read-side memory barriers that are required on DEC Alpha CPUs.
+The changes to the update side are also straightforward.  A reader-writer
+lock might be used as follows for deletion and insertion:
+        static inline int audit_del_rule(struct audit_rule *rule,
+                                         struct list_head *list)
+        {
+                struct audit_entry  *e;
+                write_lock(&auditsc_lock);
+                list_for_each_entry(e, list, list) {
+                        if (!audit_compare_rule(rule, &e->rule)) {
+                                list_del(&e->list);
+                                write_unlock(&auditsc_lock);
+                                return 0;
+                        }
+                }
+                write_unlock(&auditsc_lock);
+                return -EFAULT;         /* No matching rule */
+        }
+        static inline int audit_add_rule(struct audit_entry *entry,
+                                         struct list_head *list)
+        {
+                write_lock(&auditsc_lock);
+                if (entry->rule.flags & AUDIT_PREPEND) {
+                        entry->rule.flags &= ~AUDIT_PREPEND;
+                        list_add(&entry->list, list);
+                } else {
+                        list_add_tail(&entry->list, list);
+                }
+                write_unlock(&auditsc_lock);
+                return 0;
+        }
+Following are the RCU equivalents for these two functions:
+        static inline int audit_del_rule(struct audit_rule *rule,
+                                         struct list_head *list)
+        {
+                struct audit_entry  *e;
+                /* Do not use the _rcu iterator here, since this is the only
+                 * deletion routine. */
+                list_for_each_entry(e, list, list) {
+                        if (!audit_compare_rule(rule, &e->rule)) {
+                                list_del_rcu(&e->list);
+                                call_rcu(&e->rcu, audit_free_rule, e);
+                                return 0;
+                        }
+                }
+                return -EFAULT;         /* No matching rule */
+        }
+        static inline int audit_add_rule(struct audit_entry *entry,
+                                         struct list_head *list)
+        {
+                if (entry->rule.flags & AUDIT_PREPEND) {
+                        entry->rule.flags &= ~AUDIT_PREPEND;
+                        list_add_rcu(&entry->list, list);
+                } else {
+                        list_add_tail_rcu(&entry->list, list);
+                }
+                return 0;
+        }
+Normally, the write_lock() and write_unlock() would be replaced by
+a spin_lock() and a spin_unlock(), but in this case, all callers hold
+audit_netlink_sem, so no additional locking is required.  The auditsc_lock
+can therefore be eliminated, since use of RCU eliminates the need for
+writers to exclude readers.
+The list_del(), list_add(), and list_add_tail() primitives have been
+replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
+The _rcu() list-manipulation primitives add memory barriers that are
+needed on weakly ordered CPUs (most of them!).
+So, when readers can tolerate stale data and when entries are either added
+or deleted, without in-place modification, it is very easy to use RCU!
+Example 2: Handling In-Place Updates
+The system-call auditing code does not update auditing rules in place.
+However, if it did, reader-writer-locked code to do so might look as
+follows (presumably, the field_count is only permitted to decrease,
+otherwise, the added fields would need to be filled in):
+        static inline int audit_upd_rule(struct audit_rule *rule,
+                                         struct list_head *list,
+                                         __u32 newaction,
+                                         __u32 newfield_count)
+        {
+                struct audit_entry  *e;
+                struct audit_newentry *ne;
+                write_lock(&auditsc_lock);
+                list_for_each_entry(e, list, list) {
+                        if (!audit_compare_rule(rule, &e->rule)) {
+                                e->rule.action = newaction;
+                                e->rule.field_count = newfield_count;
+                                write_unlock(&auditsc_lock);
+                                return 0;
+                        }
+                }
+                write_unlock(&auditsc_lock);
+                return -EFAULT;         /* No matching rule */
+        }
+The RCU version creates a copy, updates the copy, then replaces the old
+entry with the newly updated entry.  This sequence of actions, allowing
+concurrent reads while doing a copy to perform an update, is what gives
+RCU ("read-copy update") its name.  The RCU code is as follows:
+        static inline int audit_upd_rule(struct audit_rule *rule,
+                                         struct list_head *list,
+                                         __u32 newaction,
+                                         __u32 newfield_count)
+        {
+                struct audit_entry  *e;
+                struct audit_entry  *ne;
+                list_for_each_entry(e, list, list) {
+                        if (!audit_compare_rule(rule, &e->rule)) {
+                                ne = kmalloc(sizeof(*ne), GFP_ATOMIC);
+                                if (ne == NULL)
+                                        return -ENOMEM;
+                                audit_copy_rule(&ne->rule, &e->rule);
+                                ne->rule.action = newaction;
+                                ne->rule.field_count = newfield_count;
+                                list_add_rcu(&ne->list, &e->list);
+                                list_del_rcu(&e->list);
+                                call_rcu(&e->rcu, audit_free_rule, e);
+                                return 0;
+                        }
+                }
+                return -EFAULT;         /* No matching rule */
+        }
+Again, this assumes that the caller holds audit_netlink_sem.  Normally,
+the reader-writer lock would become a spinlock in this sort of code.
+Example 3: Eliminating Stale Data
+The auditing examples above tolerate stale data, as do most algorithms
+that are tracking external state.  Because there is a delay from the
+time the external state changes until Linux becomes aware of the change,
+additional RCU-induced staleness is normally not a problem.
+However, there are many examples where stale data cannot be tolerated.
+One example in the Linux kernel is the System V IPC (see the ipc_lock()
+function in ipc/util.c).  This code checks a "deleted" flag under a
+per-entry spinlock, and, if the "deleted" flag is set, pretends that the
+entry does not exist.  For this to be helpful, the search function must
+return holding the per-entry spinlock, as ipc_lock() does in fact do.
+Quick Quiz:  Why does the search function need to return holding the
+per-entry lock for this deleted-flag technique to be helpful?
+If the system-call audit module were to ever need to reject stale data,
+one way to accomplish this would be to add a "deleted" flag and a "lock"
+spinlock to the audit_entry structure, and modify audit_filter_task()
+as follows:
+        static enum audit_state audit_filter_task(struct task_struct *tsk)
+        {
+                struct audit_entry *e;
+                enum audit_state   state;
+                rcu_read_lock();
+                list_for_each_entry_rcu(e, &audit_tsklist, list) {
+                        if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+                                spin_lock(&e->lock);
+                                if (e->deleted) {
+                                        spin_unlock(&e->lock);
+                                        rcu_read_unlock();
+                                        return AUDIT_BUILD_CONTEXT;
+                                }
+                                rcu_read_unlock();
+                                return state;
+                        }
+                }
+                rcu_read_unlock();
+                return AUDIT_BUILD_CONTEXT;
+        }
+Note that this example assumes that entries are only added and deleted.
+Additional mechanism is required to deal correctly with the
+update-in-place performed by audit_upd_rule().  For one thing,
+audit_upd_rule() would need additional memory barriers to ensure
+that the list_add_rcu() was really executed before the list_del_rcu().
+The audit_del_rule() function would need to set the "deleted"
+flag under the spinlock as follows:
+        static inline int audit_del_rule(struct audit_rule *rule,
+                                         struct list_head *list)
+        {
+                struct audit_entry  *e;
+                /* Do not use the _rcu iterator here, since this is the only
+                 * deletion routine. */
+                list_for_each_entry(e, list, list) {
+                        if (!audit_compare_rule(rule, &e->rule)) {
+                                spin_lock(&e->lock);
+                                list_del_rcu(&e->list);
+                                e->deleted = 1;
+                                spin_unlock(&e->lock);
+                                call_rcu(&e->rcu, audit_free_rule, e);
+                                return 0;
+                        }
+                }
+                return -EFAULT;         /* No matching rule */
+        }
+Summary
+Read-mostly list-based data structures that can tolerate stale data are
+the most amenable to use of RCU.  The simplest case is where entries are
+either added or deleted from the data structure (or atomically modified
+in place), but non-atomic in-place modifications can be handled by making
+a copy, updating the copy, then replacing the original with the copy.
+If stale data cannot be tolerated, then a "deleted" flag may be used
+in conjunction with a per-entry spinlock in order to allow the search
+function to reject newly deleted data.
+Answer to Quick Quiz
+If the search function drops the per-entry lock before returning, then
+the caller will be processing stale data in any case.  If it is really
+OK to be processing stale data, then you don't need a "deleted" flag.
+If processing stale data really is a problem, then you need to hold the
+per-entry lock across all of the code that uses the value looked up.

diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt new file mode 100644 index 000000000000..bda6ead69bd0 --- /dev/null +++ b/Documentation/RCU/listRCU.txt
@@ -0,0 +1,307 @@
	1	Using RCU to Protect Read-Mostly Linked Lists
	2
	3
	4	One of the best applications of RCU is to protect read-mostly linked lists
	5	("struct list_head" in list.h). One big advantage of this approach
	6	is that all of the required memory barriers are included for you in
	7	the list macros. This document describes several applications of RCU,
	8	with the best fits first.
	9
	10
	11	Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
	12
	13	The best applications are cases where, if reader-writer locking were
	14	used, the read-side lock would be dropped before taking any action
	15	based on the results of the search. The most celebrated example is
	16	the routing table. Because the routing table is tracking the state of
	17	equipment outside of the computer, it will at times contain stale data.
	18	Therefore, once the route has been computed, there is no need to hold
	19	the routing table static during transmission of the packet. After all,
	20	you can hold the routing table static all you want, but that won't keep
	21	the external Internet from changing, and it is the state of the external
	22	Internet that really matters. In addition, routing entries are typically
	23	added or deleted, rather than being modified in place.
	24
	25	A straightforward example of this use of RCU may be found in the
	26	system-call auditing support. For example, a reader-writer locked
	27	implementation of audit_filter_task() might be as follows:
	28
	29	static enum audit_state audit_filter_task(struct task_struct *tsk)
	30	{
	31	struct audit_entry *e;
	32	enum audit_state state;
	33
	34	read_lock(&auditsc_lock);
	35	list_for_each_entry(e, &audit_tsklist, list) {
	36	if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
	37	read_unlock(&auditsc_lock);
	38	return state;
	39	}
	40	}
	41	read_unlock(&auditsc_lock);
	42	return AUDIT_BUILD_CONTEXT;
	43	}
	44
	45	Here the list is searched under the lock, but the lock is dropped before
	46	the corresponding value is returned. By the time that this value is acted
	47	on, the list may well have been modified. This makes sense, since if
	48	you are turning auditing off, it is OK to audit a few extra system calls.
	49
	50	This means that RCU can be easily applied to the read side, as follows:
	51
	52	static enum audit_state audit_filter_task(struct task_struct *tsk)
	53	{
	54	struct audit_entry *e;
	55	enum audit_state state;
	56
	57	rcu_read_lock();
	58	list_for_each_entry_rcu(e, &audit_tsklist, list) {
	59	if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
	60	rcu_read_unlock();
	61	return state;
	62	}
	63	}
	64	rcu_read_unlock();
	65	return AUDIT_BUILD_CONTEXT;
	66	}
	67
	68	The read_lock() and read_unlock() calls have become rcu_read_lock()
	69	and rcu_read_unlock(), respectively, and the list_for_each_entry() has
	70	become list_for_each_entry_rcu(). The _rcu() list-traversal primitives
	71	insert the read-side memory barriers that are required on DEC Alpha CPUs.
	72
	73	The changes to the update side are also straightforward. A reader-writer
	74	lock might be used as follows for deletion and insertion:
	75
	76	static inline int audit_del_rule(struct audit_rule *rule,
	77	struct list_head *list)
	78	{
	79	struct audit_entry *e;
	80
	81	write_lock(&auditsc_lock);
	82	list_for_each_entry(e, list, list) {
	83	if (!audit_compare_rule(rule, &e->rule)) {
	84	list_del(&e->list);
	85	write_unlock(&auditsc_lock);
	86	return 0;
	87	}
	88	}
	89	write_unlock(&auditsc_lock);
	90	return -EFAULT; /* No matching rule */
	91	}
	92
	93	static inline int audit_add_rule(struct audit_entry *entry,
	94	struct list_head *list)
	95	{
	96	write_lock(&auditsc_lock);
	97	if (entry->rule.flags & AUDIT_PREPEND) {
	98	entry->rule.flags &= ~AUDIT_PREPEND;
	99	list_add(&entry->list, list);
	100	} else {
	101	list_add_tail(&entry->list, list);
	102	}
	103	write_unlock(&auditsc_lock);
	104	return 0;
	105	}
	106
	107	Following are the RCU equivalents for these two functions:
	108
	109	static inline int audit_del_rule(struct audit_rule *rule,
	110	struct list_head *list)
	111	{
	112	struct audit_entry *e;
	113
	114	/* Do not use the _rcu iterator here, since this is the only
	115	* deletion routine. */
	116	list_for_each_entry(e, list, list) {
	117	if (!audit_compare_rule(rule, &e->rule)) {
	118	list_del_rcu(&e->list);
	119	call_rcu(&e->rcu, audit_free_rule, e);
	120	return 0;
	121	}
	122	}
	123	return -EFAULT; /* No matching rule */
	124	}
	125
	126	static inline int audit_add_rule(struct audit_entry *entry,
	127	struct list_head *list)
	128	{
	129	if (entry->rule.flags & AUDIT_PREPEND) {
	130	entry->rule.flags &= ~AUDIT_PREPEND;
	131	list_add_rcu(&entry->list, list);
	132	} else {
	133	list_add_tail_rcu(&entry->list, list);
	134	}
	135	return 0;
	136	}
	137
	138	Normally, the write_lock() and write_unlock() would be replaced by
	139	a spin_lock() and a spin_unlock(), but in this case, all callers hold
	140	audit_netlink_sem, so no additional locking is required. The auditsc_lock
	141	can therefore be eliminated, since use of RCU eliminates the need for
	142	writers to exclude readers.
	143
	144	The list_del(), list_add(), and list_add_tail() primitives have been
	145	replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
	146	The _rcu() list-manipulation primitives add memory barriers that are
	147	needed on weakly ordered CPUs (most of them!).
	148
	149	So, when readers can tolerate stale data and when entries are either added
	150	or deleted, without in-place modification, it is very easy to use RCU!
	151
	152
	153	Example 2: Handling In-Place Updates
	154
	155	The system-call auditing code does not update auditing rules in place.
	156	However, if it did, reader-writer-locked code to do so might look as
	157	follows (presumably, the field_count is only permitted to decrease,
	158	otherwise, the added fields would need to be filled in):
	159
	160	static inline int audit_upd_rule(struct audit_rule *rule,
	161	struct list_head *list,
	162	__u32 newaction,
	163	__u32 newfield_count)
	164	{
	165	struct audit_entry *e;
	166	struct audit_newentry *ne;
	167
	168	write_lock(&auditsc_lock);
	169	list_for_each_entry(e, list, list) {
	170	if (!audit_compare_rule(rule, &e->rule)) {
	171	e->rule.action = newaction;
	172	e->rule.field_count = newfield_count;
	173	write_unlock(&auditsc_lock);
	174	return 0;
	175	}
	176	}
	177	write_unlock(&auditsc_lock);
	178	return -EFAULT; /* No matching rule */
	179	}
	180
	181	The RCU version creates a copy, updates the copy, then replaces the old
	182	entry with the newly updated entry. This sequence of actions, allowing
	183	concurrent reads while doing a copy to perform an update, is what gives
	184	RCU ("read-copy update") its name. The RCU code is as follows:
	185
	186	static inline int audit_upd_rule(struct audit_rule *rule,
	187	struct list_head *list,
	188	__u32 newaction,
	189	__u32 newfield_count)
	190	{
	191	struct audit_entry *e;
	192	struct audit_entry *ne;
	193
	194	list_for_each_entry(e, list, list) {
	195	if (!audit_compare_rule(rule, &e->rule)) {
	196	ne = kmalloc(sizeof(*ne), GFP_ATOMIC);
	197	if (ne == NULL)
	198	return -ENOMEM;
	199	audit_copy_rule(&ne->rule, &e->rule);
	200	ne->rule.action = newaction;
	201	ne->rule.field_count = newfield_count;
	202	list_add_rcu(&ne->list, &e->list);
	203	list_del_rcu(&e->list);
	204	call_rcu(&e->rcu, audit_free_rule, e);
	205	return 0;
	206	}
	207	}
	208	return -EFAULT; /* No matching rule */
	209	}
	210
	211	Again, this assumes that the caller holds audit_netlink_sem. Normally,
	212	the reader-writer lock would become a spinlock in this sort of code.
	213
	214
	215	Example 3: Eliminating Stale Data
	216
	217	The auditing examples above tolerate stale data, as do most algorithms
	218	that are tracking external state. Because there is a delay from the
	219	time the external state changes until Linux becomes aware of the change,
	220	additional RCU-induced staleness is normally not a problem.
	221
	222	However, there are many examples where stale data cannot be tolerated.
	223	One example in the Linux kernel is the System V IPC (see the ipc_lock()
	224	function in ipc/util.c). This code checks a "deleted" flag under a
	225	per-entry spinlock, and, if the "deleted" flag is set, pretends that the
	226	entry does not exist. For this to be helpful, the search function must
	227	return holding the per-entry spinlock, as ipc_lock() does in fact do.
	228
	229	Quick Quiz: Why does the search function need to return holding the
	230	per-entry lock for this deleted-flag technique to be helpful?
	231
	232	If the system-call audit module were to ever need to reject stale data,
	233	one way to accomplish this would be to add a "deleted" flag and a "lock"
	234	spinlock to the audit_entry structure, and modify audit_filter_task()
	235	as follows:
	236
	237	static enum audit_state audit_filter_task(struct task_struct *tsk)
	238	{
	239	struct audit_entry *e;
	240	enum audit_state state;
	241
	242	rcu_read_lock();
	243	list_for_each_entry_rcu(e, &audit_tsklist, list) {
	244	if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
	245	spin_lock(&e->lock);
	246	if (e->deleted) {
	247	spin_unlock(&e->lock);
	248	rcu_read_unlock();
	249	return AUDIT_BUILD_CONTEXT;
	250	}
	251	rcu_read_unlock();
	252	return state;
	253	}
	254	}
	255	rcu_read_unlock();
	256	return AUDIT_BUILD_CONTEXT;
	257	}
	258
	259	Note that this example assumes that entries are only added and deleted.
	260	Additional mechanism is required to deal correctly with the
	261	update-in-place performed by audit_upd_rule(). For one thing,
	262	audit_upd_rule() would need additional memory barriers to ensure
	263	that the list_add_rcu() was really executed before the list_del_rcu().
	264
	265	The audit_del_rule() function would need to set the "deleted"
	266	flag under the spinlock as follows:
	267
	268	static inline int audit_del_rule(struct audit_rule *rule,
	269	struct list_head *list)
	270	{
	271	struct audit_entry *e;
	272
	273	/* Do not use the _rcu iterator here, since this is the only
	274	* deletion routine. */
	275	list_for_each_entry(e, list, list) {
	276	if (!audit_compare_rule(rule, &e->rule)) {
	277	spin_lock(&e->lock);
	278	list_del_rcu(&e->list);
	279	e->deleted = 1;
	280	spin_unlock(&e->lock);
	281	call_rcu(&e->rcu, audit_free_rule, e);
	282	return 0;
	283	}
	284	}
	285	return -EFAULT; /* No matching rule */
	286	}
	287
	288
	289	Summary
	290
	291	Read-mostly list-based data structures that can tolerate stale data are
	292	the most amenable to use of RCU. The simplest case is where entries are
	293	either added or deleted from the data structure (or atomically modified
	294	in place), but non-atomic in-place modifications can be handled by making
	295	a copy, updating the copy, then replacing the original with the copy.
	296	If stale data cannot be tolerated, then a "deleted" flag may be used
	297	in conjunction with a per-entry spinlock in order to allow the search
	298	function to reject newly deleted data.
	299
	300
	301	Answer to Quick Quiz
	302
	303	If the search function drops the per-entry lock before returning, then
	304	the caller will be processing stale data in any case. If it is really
	305	OK to be processing stale data, then you don't need a "deleted" flag.
	306	If processing stale data really is a problem, then you need to hold the
	307	per-entry lock across all of the code that uses the value looked up.