diff options
author | Anton Blanchard <anton@samba.org> | 2010-05-16 16:22:31 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2010-07-08 21:28:35 -0400 |
commit | 41eab6f88f24124df89e38067b3766b7bef06ddb (patch) | |
tree | 41ec35970c76adbba1558b6243d80be669062136 | |
parent | a591f6b56d6fbd7d1951e352fe5b0acf6b91e497 (diff) |
powerpc/numa: Use form 1 affinity to setup node distance
Form 1 affinity allows multiple entries in ibm,associativity-reference-points
which represent affinity domains in decreasing order of importance. The
Linux concept of a node is always the first entry, but using the other
values as an input to node_distance() allows the memory allocator to make
better decisions on which node to go first when local memory has been
exhausted.
We keep things simple and create an array indexed by NUMA node, capped at
4 entries. Each time we lookup an associativity property we initialise
the array which is overkill, but since we should only hit this path during
boot it didn't seem worth adding a per node valid bit.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r-- | arch/powerpc/include/asm/topology.h | 3 | ||||
-rw-r--r-- | arch/powerpc/mm/numa.c | 122 |
2 files changed, 92 insertions, 33 deletions
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 32adf728072..3033c1b3074 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h | |||
@@ -87,6 +87,9 @@ static inline int pcibus_to_node(struct pci_bus *bus) | |||
87 | .balance_interval = 1, \ | 87 | .balance_interval = 1, \ |
88 | } | 88 | } |
89 | 89 | ||
90 | extern int __node_distance(int, int); | ||
91 | #define node_distance(a, b) __node_distance(a, b) | ||
92 | |||
90 | extern void __init dump_numa_cpu_topology(void); | 93 | extern void __init dump_numa_cpu_topology(void); |
91 | 94 | ||
92 | extern int sysfs_add_device_to_node(struct sys_device *dev, int nid); | 95 | extern int sysfs_add_device_to_node(struct sys_device *dev, int nid); |
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 80d110635d2..f78f19e0a2a 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c | |||
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data); | |||
42 | 42 | ||
43 | static int min_common_depth; | 43 | static int min_common_depth; |
44 | static int n_mem_addr_cells, n_mem_size_cells; | 44 | static int n_mem_addr_cells, n_mem_size_cells; |
45 | static int form1_affinity; | ||
46 | |||
47 | #define MAX_DISTANCE_REF_POINTS 4 | ||
48 | static int distance_ref_points_depth; | ||
49 | static const unsigned int *distance_ref_points; | ||
50 | static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; | ||
45 | 51 | ||
46 | /* | 52 | /* |
47 | * Allocate node_to_cpumask_map based on number of available nodes | 53 | * Allocate node_to_cpumask_map based on number of available nodes |
@@ -204,6 +210,39 @@ static const u32 *of_get_usable_memory(struct device_node *memory) | |||
204 | return prop; | 210 | return prop; |
205 | } | 211 | } |
206 | 212 | ||
213 | int __node_distance(int a, int b) | ||
214 | { | ||
215 | int i; | ||
216 | int distance = LOCAL_DISTANCE; | ||
217 | |||
218 | if (!form1_affinity) | ||
219 | return distance; | ||
220 | |||
221 | for (i = 0; i < distance_ref_points_depth; i++) { | ||
222 | if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) | ||
223 | break; | ||
224 | |||
225 | /* Double the distance for each NUMA level */ | ||
226 | distance *= 2; | ||
227 | } | ||
228 | |||
229 | return distance; | ||
230 | } | ||
231 | |||
232 | static void initialize_distance_lookup_table(int nid, | ||
233 | const unsigned int *associativity) | ||
234 | { | ||
235 | int i; | ||
236 | |||
237 | if (!form1_affinity) | ||
238 | return; | ||
239 | |||
240 | for (i = 0; i < distance_ref_points_depth; i++) { | ||
241 | distance_lookup_table[nid][i] = | ||
242 | associativity[distance_ref_points[i]]; | ||
243 | } | ||
244 | } | ||
245 | |||
207 | /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa | 246 | /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa |
208 | * info is found. | 247 | * info is found. |
209 | */ | 248 | */ |
@@ -225,6 +264,10 @@ static int of_node_to_nid_single(struct device_node *device) | |||
225 | /* POWER4 LPAR uses 0xffff as invalid node */ | 264 | /* POWER4 LPAR uses 0xffff as invalid node */ |
226 | if (nid == 0xffff || nid >= MAX_NUMNODES) | 265 | if (nid == 0xffff || nid >= MAX_NUMNODES) |
227 | nid = -1; | 266 | nid = -1; |
267 | |||
268 | if (nid > 0 && tmp[0] >= distance_ref_points_depth) | ||
269 | initialize_distance_lookup_table(nid, tmp); | ||
270 | |||
228 | out: | 271 | out: |
229 | return nid; | 272 | return nid; |
230 | } | 273 | } |
@@ -251,26 +294,10 @@ int of_node_to_nid(struct device_node *device) | |||
251 | } | 294 | } |
252 | EXPORT_SYMBOL_GPL(of_node_to_nid); | 295 | EXPORT_SYMBOL_GPL(of_node_to_nid); |
253 | 296 | ||
254 | /* | ||
255 | * In theory, the "ibm,associativity" property may contain multiple | ||
256 | * associativity lists because a resource may be multiply connected | ||
257 | * into the machine. This resource then has different associativity | ||
258 | * characteristics relative to its multiple connections. We ignore | ||
259 | * this for now. We also assume that all cpu and memory sets have | ||
260 | * their distances represented at a common level. This won't be | ||
261 | * true for hierarchical NUMA. | ||
262 | * | ||
263 | * In any case the ibm,associativity-reference-points should give | ||
264 | * the correct depth for a normal NUMA system. | ||
265 | * | ||
266 | * - Dave Hansen <haveblue@us.ibm.com> | ||
267 | */ | ||
268 | static int __init find_min_common_depth(void) | 297 | static int __init find_min_common_depth(void) |
269 | { | 298 | { |
270 | int depth, index; | 299 | int depth; |
271 | const unsigned int *ref_points; | ||
272 | struct device_node *rtas_root; | 300 | struct device_node *rtas_root; |
273 | unsigned int len; | ||
274 | struct device_node *chosen; | 301 | struct device_node *chosen; |
275 | const char *vec5; | 302 | const char *vec5; |
276 | 303 | ||
@@ -280,18 +307,28 @@ static int __init find_min_common_depth(void) | |||
280 | return -1; | 307 | return -1; |
281 | 308 | ||
282 | /* | 309 | /* |
283 | * this property is 2 32-bit integers, each representing a level of | 310 | * This property is a set of 32-bit integers, each representing |
284 | * depth in the associativity nodes. The first is for an SMP | 311 | * an index into the ibm,associativity nodes. |
285 | * configuration (should be all 0's) and the second is for a normal | 312 | * |
286 | * NUMA configuration. | 313 | * With form 0 affinity the first integer is for an SMP configuration |
314 | * (should be all 0's) and the second is for a normal NUMA | ||
315 | * configuration. We have only one level of NUMA. | ||
316 | * | ||
317 | * With form 1 affinity the first integer is the most significant | ||
318 | * NUMA boundary and the following are progressively less significant | ||
319 | * boundaries. There can be more than one level of NUMA. | ||
287 | */ | 320 | */ |
288 | index = 1; | 321 | distance_ref_points = of_get_property(rtas_root, |
289 | ref_points = of_get_property(rtas_root, | 322 | "ibm,associativity-reference-points", |
290 | "ibm,associativity-reference-points", &len); | 323 | &distance_ref_points_depth); |
324 | |||
325 | if (!distance_ref_points) { | ||
326 | dbg("NUMA: ibm,associativity-reference-points not found.\n"); | ||
327 | goto err; | ||
328 | } | ||
329 | |||
330 | distance_ref_points_depth /= sizeof(int); | ||
291 | 331 | ||
292 | /* | ||
293 | * For form 1 affinity information we want the first field | ||
294 | */ | ||
295 | #define VEC5_AFFINITY_BYTE 5 | 332 | #define VEC5_AFFINITY_BYTE 5 |
296 | #define VEC5_AFFINITY 0x80 | 333 | #define VEC5_AFFINITY 0x80 |
297 | chosen = of_find_node_by_path("/chosen"); | 334 | chosen = of_find_node_by_path("/chosen"); |
@@ -299,19 +336,38 @@ static int __init find_min_common_depth(void) | |||
299 | vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL); | 336 | vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL); |
300 | if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) { | 337 | if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) { |
301 | dbg("Using form 1 affinity\n"); | 338 | dbg("Using form 1 affinity\n"); |
302 | index = 0; | 339 | form1_affinity = 1; |
303 | } | 340 | } |
304 | } | 341 | } |
305 | 342 | ||
306 | if ((len >= 2 * sizeof(unsigned int)) && ref_points) { | 343 | if (form1_affinity) { |
307 | depth = ref_points[index]; | 344 | depth = distance_ref_points[0]; |
308 | } else { | 345 | } else { |
309 | dbg("NUMA: ibm,associativity-reference-points not found.\n"); | 346 | if (distance_ref_points_depth < 2) { |
310 | depth = -1; | 347 | printk(KERN_WARNING "NUMA: " |
348 | "short ibm,associativity-reference-points\n"); | ||
349 | goto err; | ||
350 | } | ||
351 | |||
352 | depth = distance_ref_points[1]; | ||
311 | } | 353 | } |
312 | of_node_put(rtas_root); | ||
313 | 354 | ||
355 | /* | ||
356 | * Warn and cap if the hardware supports more than | ||
357 | * MAX_DISTANCE_REF_POINTS domains. | ||
358 | */ | ||
359 | if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { | ||
360 | printk(KERN_WARNING "NUMA: distance array capped at " | ||
361 | "%d entries\n", MAX_DISTANCE_REF_POINTS); | ||
362 | distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; | ||
363 | } | ||
364 | |||
365 | of_node_put(rtas_root); | ||
314 | return depth; | 366 | return depth; |
367 | |||
368 | err: | ||
369 | of_node_put(rtas_root); | ||
370 | return -1; | ||
315 | } | 371 | } |
316 | 372 | ||
317 | static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) | 373 | static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) |