diff options
-rw-r--r-- | Documentation/RCU/rculist_nulls.txt | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt new file mode 100644 index 000000000000..239f542d48ba --- /dev/null +++ b/Documentation/RCU/rculist_nulls.txt | |||
@@ -0,0 +1,167 @@ | |||
1 | Using hlist_nulls to protect read-mostly linked lists and | ||
2 | objects using SLAB_DESTROY_BY_RCU allocations. | ||
3 | |||
4 | Please read the basics in Documentation/RCU/listRCU.txt | ||
5 | |||
6 | Using special makers (called 'nulls') is a convenient way | ||
7 | to solve following problem : | ||
8 | |||
9 | A typical RCU linked list managing objects which are | ||
10 | allocated with SLAB_DESTROY_BY_RCU kmem_cache can | ||
11 | use following algos : | ||
12 | |||
13 | 1) Lookup algo | ||
14 | -------------- | ||
15 | rcu_read_lock() | ||
16 | begin: | ||
17 | obj = lockless_lookup(key); | ||
18 | if (obj) { | ||
19 | if (!try_get_ref(obj)) // might fail for free objects | ||
20 | goto begin; | ||
21 | /* | ||
22 | * Because a writer could delete object, and a writer could | ||
23 | * reuse these object before the RCU grace period, we | ||
24 | * must check key after geting the reference on object | ||
25 | */ | ||
26 | if (obj->key != key) { // not the object we expected | ||
27 | put_ref(obj); | ||
28 | goto begin; | ||
29 | } | ||
30 | } | ||
31 | rcu_read_unlock(); | ||
32 | |||
33 | Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu() | ||
34 | but a version with an additional memory barrier (smp_rmb()) | ||
35 | |||
36 | lockless_lookup(key) | ||
37 | { | ||
38 | struct hlist_node *node, *next; | ||
39 | for (pos = rcu_dereference((head)->first); | ||
40 | pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && | ||
41 | ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); | ||
42 | pos = rcu_dereference(next)) | ||
43 | if (obj->key == key) | ||
44 | return obj; | ||
45 | return NULL; | ||
46 | |||
47 | And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() : | ||
48 | |||
49 | struct hlist_node *node; | ||
50 | for (pos = rcu_dereference((head)->first); | ||
51 | pos && ({ prefetch(pos->next); 1; }) && | ||
52 | ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); | ||
53 | pos = rcu_dereference(pos->next)) | ||
54 | if (obj->key == key) | ||
55 | return obj; | ||
56 | return NULL; | ||
57 | } | ||
58 | |||
59 | Quoting Corey Minyard : | ||
60 | |||
61 | "If the object is moved from one list to another list in-between the | ||
62 | time the hash is calculated and the next field is accessed, and the | ||
63 | object has moved to the end of a new list, the traversal will not | ||
64 | complete properly on the list it should have, since the object will | ||
65 | be on the end of the new list and there's not a way to tell it's on a | ||
66 | new list and restart the list traversal. I think that this can be | ||
67 | solved by pre-fetching the "next" field (with proper barriers) before | ||
68 | checking the key." | ||
69 | |||
70 | 2) Insert algo : | ||
71 | ---------------- | ||
72 | |||
73 | We need to make sure a reader cannot read the new 'obj->obj_next' value | ||
74 | and previous value of 'obj->key'. Or else, an item could be deleted | ||
75 | from a chain, and inserted into another chain. If new chain was empty | ||
76 | before the move, 'next' pointer is NULL, and lockless reader can | ||
77 | not detect it missed following items in original chain. | ||
78 | |||
79 | /* | ||
80 | * Please note that new inserts are done at the head of list, | ||
81 | * not in the middle or end. | ||
82 | */ | ||
83 | obj = kmem_cache_alloc(...); | ||
84 | lock_chain(); // typically a spin_lock() | ||
85 | obj->key = key; | ||
86 | atomic_inc(&obj->refcnt); | ||
87 | /* | ||
88 | * we need to make sure obj->key is updated before obj->next | ||
89 | */ | ||
90 | smp_wmb(); | ||
91 | hlist_add_head_rcu(&obj->obj_node, list); | ||
92 | unlock_chain(); // typically a spin_unlock() | ||
93 | |||
94 | |||
95 | 3) Remove algo | ||
96 | -------------- | ||
97 | Nothing special here, we can use a standard RCU hlist deletion. | ||
98 | But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused | ||
99 | very very fast (before the end of RCU grace period) | ||
100 | |||
101 | if (put_last_reference_on(obj) { | ||
102 | lock_chain(); // typically a spin_lock() | ||
103 | hlist_del_init_rcu(&obj->obj_node); | ||
104 | unlock_chain(); // typically a spin_unlock() | ||
105 | kmem_cache_free(cachep, obj); | ||
106 | } | ||
107 | |||
108 | |||
109 | |||
110 | -------------------------------------------------------------------------- | ||
111 | With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() | ||
112 | and extra smp_wmb() in insert function. | ||
113 | |||
114 | For example, if we choose to store the slot number as the 'nulls' | ||
115 | end-of-list marker for each slot of the hash table, we can detect | ||
116 | a race (some writer did a delete and/or a move of an object | ||
117 | to another chain) checking the final 'nulls' value if | ||
118 | the lookup met the end of chain. If final 'nulls' value | ||
119 | is not the slot number, then we must restart the lookup at | ||
120 | the begining. If the object was moved to same chain, | ||
121 | then the reader doesnt care : It might eventually | ||
122 | scan the list again without harm. | ||
123 | |||
124 | |||
125 | 1) lookup algo | ||
126 | |||
127 | head = &table[slot]; | ||
128 | rcu_read_lock(); | ||
129 | begin: | ||
130 | hlist_nulls_for_each_entry_rcu(obj, node, head, member) { | ||
131 | if (obj->key == key) { | ||
132 | if (!try_get_ref(obj)) // might fail for free objects | ||
133 | goto begin; | ||
134 | if (obj->key != key) { // not the object we expected | ||
135 | put_ref(obj); | ||
136 | goto begin; | ||
137 | } | ||
138 | goto out; | ||
139 | } | ||
140 | /* | ||
141 | * if the nulls value we got at the end of this lookup is | ||
142 | * not the expected one, we must restart lookup. | ||
143 | * We probably met an item that was moved to another chain. | ||
144 | */ | ||
145 | if (get_nulls_value(node) != slot) | ||
146 | goto begin; | ||
147 | obj = NULL; | ||
148 | |||
149 | out: | ||
150 | rcu_read_unlock(); | ||
151 | |||
152 | 2) Insert function : | ||
153 | -------------------- | ||
154 | |||
155 | /* | ||
156 | * Please note that new inserts are done at the head of list, | ||
157 | * not in the middle or end. | ||
158 | */ | ||
159 | obj = kmem_cache_alloc(cachep); | ||
160 | lock_chain(); // typically a spin_lock() | ||
161 | obj->key = key; | ||
162 | atomic_set(&obj->refcnt, 1); | ||
163 | /* | ||
164 | * insert obj in RCU way (readers might be traversing chain) | ||
165 | */ | ||
166 | hlist_nulls_add_head_rcu(&obj->obj_node, list); | ||
167 | unlock_chain(); // typically a spin_unlock() | ||