aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorDoug Thompson <dougthompson@xmission.com>2007-07-19 04:50:30 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-19 13:04:57 -0400
commitbf52fa4a26567bfbf5b1d30f84cf0226e61d26cd (patch)
tree29ff1069cb99043f943cf11bc4423051bd42fbfc /drivers/edac
parentfb3fb2068775a1363265edc00870aa5e2f0e3631 (diff)
drivers/edac: fix workq reset deadlock
Fix mutex locking deadlock on the device controller linked list. Was calling a lock then a function that could call the same lock. Moved the cancel workq function to outside the lock Added some short circuit logic in the workq code Added comments of description Code tidying Signed-off-by: Doug Thompson <dougthompson@xmission.com> Cc: Greg KH <greg@kroah.com> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: Oleg Nesterov <oleg@tv-sign.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/edac_device.c56
-rw-r--r--drivers/edac/edac_mc.c72
2 files changed, 100 insertions, 28 deletions
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 173f4ba0f7c..7e3723768ac 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -32,7 +32,9 @@
32#include "edac_core.h" 32#include "edac_core.h"
33#include "edac_module.h" 33#include "edac_module.h"
34 34
35/* lock to memory controller's control array 'edac_device_list' */ 35/* lock for the list: 'edac_device_list', manipulation of this list
36 * is protected by the 'device_ctls_mutex' lock
37 */
36static DEFINE_MUTEX(device_ctls_mutex); 38static DEFINE_MUTEX(device_ctls_mutex);
37static struct list_head edac_device_list = LIST_HEAD_INIT(edac_device_list); 39static struct list_head edac_device_list = LIST_HEAD_INIT(edac_device_list);
38 40
@@ -386,6 +388,14 @@ EXPORT_SYMBOL_GPL(edac_device_find);
386/* 388/*
387 * edac_device_workq_function 389 * edac_device_workq_function
388 * performs the operation scheduled by a workq request 390 * performs the operation scheduled by a workq request
391 *
392 * this workq is embedded within an edac_device_ctl_info
393 * structure, that needs to be polled for possible error events.
394 *
395 * This operation is to acquire the list mutex lock
396 * (thus preventing insertation or deletion)
397 * and then call the device's poll function IFF this device is
398 * running polled and there is a poll function defined.
389 */ 399 */
390static void edac_device_workq_function(struct work_struct *work_req) 400static void edac_device_workq_function(struct work_struct *work_req)
391{ 401{
@@ -403,8 +413,17 @@ static void edac_device_workq_function(struct work_struct *work_req)
403 413
404 mutex_unlock(&device_ctls_mutex); 414 mutex_unlock(&device_ctls_mutex);
405 415
406 /* Reschedule */ 416 /* Reschedule the workq for the next time period to start again
407 queue_delayed_work(edac_workqueue, &edac_dev->work, edac_dev->delay); 417 * if the number of msec is for 1 sec, then adjust to the next
418 * whole one second to save timers fireing all over the period
419 * between integral seconds
420 */
421 if (edac_dev->poll_msec == 1000)
422 queue_delayed_work(edac_workqueue, &edac_dev->work,
423 round_jiffies(edac_dev->delay));
424 else
425 queue_delayed_work(edac_workqueue, &edac_dev->work,
426 edac_dev->delay);
408} 427}
409 428
410/* 429/*
@@ -417,11 +436,26 @@ void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev,
417{ 436{
418 debugf0("%s()\n", __func__); 437 debugf0("%s()\n", __func__);
419 438
439 /* take the arg 'msec' and set it into the control structure
440 * to used in the time period calculation
441 * then calc the number of jiffies that represents
442 */
420 edac_dev->poll_msec = msec; 443 edac_dev->poll_msec = msec;
421 edac_dev->delay = msecs_to_jiffies(msec); /* Calc delay jiffies */ 444 edac_dev->delay = msecs_to_jiffies(msec);
422 445
423 INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function); 446 INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function);
424 queue_delayed_work(edac_workqueue, &edac_dev->work, edac_dev->delay); 447
448 /* optimize here for the 1 second case, which will be normal value, to
449 * fire ON the 1 second time event. This helps reduce all sorts of
450 * timers firing on sub-second basis, while they are happy
451 * to fire together on the 1 second exactly
452 */
453 if (edac_dev->poll_msec == 1000)
454 queue_delayed_work(edac_workqueue, &edac_dev->work,
455 round_jiffies(edac_dev->delay));
456 else
457 queue_delayed_work(edac_workqueue, &edac_dev->work,
458 edac_dev->delay);
425} 459}
426 460
427/* 461/*
@@ -441,16 +475,20 @@ void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev)
441 475
442/* 476/*
443 * edac_device_reset_delay_period 477 * edac_device_reset_delay_period
478 *
479 * need to stop any outstanding workq queued up at this time
480 * because we will be resetting the sleep time.
481 * Then restart the workq on the new delay
444 */ 482 */
445
446void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, 483void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
447 unsigned long value) 484 unsigned long value)
448{ 485{
449 mutex_lock(&device_ctls_mutex); 486 /* cancel the current workq request, without the mutex lock */
450
451 /* cancel the current workq request */
452 edac_device_workq_teardown(edac_dev); 487 edac_device_workq_teardown(edac_dev);
453 488
489 /* acquire the mutex before doing the workq setup */
490 mutex_lock(&device_ctls_mutex);
491
454 /* restart the workq request, with new delay value */ 492 /* restart the workq request, with new delay value */
455 edac_device_workq_setup(edac_dev, value); 493 edac_device_workq_setup(edac_dev, value);
456 494
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 2d53cb38868..4471be36259 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -258,6 +258,12 @@ static void edac_mc_workq_function(struct work_struct *work_req)
258 258
259 mutex_lock(&mem_ctls_mutex); 259 mutex_lock(&mem_ctls_mutex);
260 260
261 /* if this control struct has movd to offline state, we are done */
262 if (mci->op_state == OP_OFFLINE) {
263 mutex_unlock(&mem_ctls_mutex);
264 return;
265 }
266
261 /* Only poll controllers that are running polled and have a check */ 267 /* Only poll controllers that are running polled and have a check */
262 if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL)) 268 if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
263 mci->edac_check(mci); 269 mci->edac_check(mci);
@@ -279,11 +285,19 @@ static void edac_mc_workq_function(struct work_struct *work_req)
279 * edac_mc_workq_setup 285 * edac_mc_workq_setup
280 * initialize a workq item for this mci 286 * initialize a workq item for this mci
281 * passing in the new delay period in msec 287 * passing in the new delay period in msec
288 *
289 * locking model:
290 *
291 * called with the mem_ctls_mutex held
282 */ 292 */
283void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) 293static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
284{ 294{
285 debugf0("%s()\n", __func__); 295 debugf0("%s()\n", __func__);
286 296
297 /* if this instance is not in the POLL state, then simply return */
298 if (mci->op_state != OP_RUNNING_POLL)
299 return;
300
287 INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); 301 INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
288 queue_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec)); 302 queue_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
289} 303}
@@ -291,29 +305,39 @@ void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
291/* 305/*
292 * edac_mc_workq_teardown 306 * edac_mc_workq_teardown
293 * stop the workq processing on this mci 307 * stop the workq processing on this mci
308 *
309 * locking model:
310 *
311 * called WITHOUT lock held
294 */ 312 */
295void edac_mc_workq_teardown(struct mem_ctl_info *mci) 313static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
296{ 314{
297 int status; 315 int status;
298 316
299 status = cancel_delayed_work(&mci->work); 317 /* if not running POLL, leave now */
300 if (status == 0) { 318 if (mci->op_state == OP_RUNNING_POLL) {
301 /* workq instance might be running, wait for it */ 319 status = cancel_delayed_work(&mci->work);
302 flush_workqueue(edac_workqueue); 320 if (status == 0) {
321 debugf0("%s() not canceled, flush the queue\n",
322 __func__);
323
324 /* workq instance might be running, wait for it */
325 flush_workqueue(edac_workqueue);
326 }
303 } 327 }
304} 328}
305 329
306/* 330/*
307 * edac_reset_delay_period 331 * edac_reset_delay_period
308 */ 332 */
309 333static void edac_reset_delay_period(struct mem_ctl_info *mci, unsigned long value)
310void edac_reset_delay_period(struct mem_ctl_info *mci, unsigned long value)
311{ 334{
312 mutex_lock(&mem_ctls_mutex);
313
314 /* cancel the current workq request */ 335 /* cancel the current workq request */
315 edac_mc_workq_teardown(mci); 336 edac_mc_workq_teardown(mci);
316 337
338 /* lock the list of devices for the new setup */
339 mutex_lock(&mem_ctls_mutex);
340
317 /* restart the workq request, with new delay value */ 341 /* restart the workq request, with new delay value */
318 edac_mc_workq_setup(mci, value); 342 edac_mc_workq_setup(mci, value);
319 343
@@ -323,6 +347,10 @@ void edac_reset_delay_period(struct mem_ctl_info *mci, unsigned long value)
323/* Return 0 on success, 1 on failure. 347/* Return 0 on success, 1 on failure.
324 * Before calling this function, caller must 348 * Before calling this function, caller must
325 * assign a unique value to mci->mc_idx. 349 * assign a unique value to mci->mc_idx.
350 *
351 * locking model:
352 *
353 * called with the mem_ctls_mutex lock held
326 */ 354 */
327static int add_mc_to_global_list(struct mem_ctl_info *mci) 355static int add_mc_to_global_list(struct mem_ctl_info *mci)
328{ 356{
@@ -331,7 +359,8 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
331 359
332 insert_before = &mc_devices; 360 insert_before = &mc_devices;
333 361
334 if (unlikely((p = find_mci_by_dev(mci->dev)) != NULL)) 362 p = find_mci_by_dev(mci->dev);
363 if (unlikely(p != NULL))
335 goto fail0; 364 goto fail0;
336 365
337 list_for_each(item, &mc_devices) { 366 list_for_each(item, &mc_devices) {
@@ -467,8 +496,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
467 } 496 }
468 497
469 /* Report action taken */ 498 /* Report action taken */
470 edac_mc_printk(mci, KERN_INFO, "Giving out device to %s %s: DEV %s\n", 499 edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
471 mci->mod_name, mci->ctl_name, dev_name(mci)); 500 " DEV %s\n", mci->mod_name, mci->ctl_name, dev_name(mci));
472 501
473 mutex_unlock(&mem_ctls_mutex); 502 mutex_unlock(&mem_ctls_mutex);
474 return 0; 503 return 0;
@@ -493,10 +522,13 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
493{ 522{
494 struct mem_ctl_info *mci; 523 struct mem_ctl_info *mci;
495 524
496 debugf0("MC: %s()\n", __func__); 525 debugf0("%s()\n", __func__);
526
497 mutex_lock(&mem_ctls_mutex); 527 mutex_lock(&mem_ctls_mutex);
498 528
499 if ((mci = find_mci_by_dev(dev)) == NULL) { 529 /* find the requested mci struct in the global list */
530 mci = find_mci_by_dev(dev);
531 if (mci == NULL) {
500 mutex_unlock(&mem_ctls_mutex); 532 mutex_unlock(&mem_ctls_mutex);
501 return NULL; 533 return NULL;
502 } 534 }
@@ -504,15 +536,17 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
504 /* marking MCI offline */ 536 /* marking MCI offline */
505 mci->op_state = OP_OFFLINE; 537 mci->op_state = OP_OFFLINE;
506 538
507 /* flush workq processes */
508 edac_mc_workq_teardown(mci);
509
510 edac_remove_sysfs_mci_device(mci);
511 del_mc_from_global_list(mci); 539 del_mc_from_global_list(mci);
512 mutex_unlock(&mem_ctls_mutex); 540 mutex_unlock(&mem_ctls_mutex);
541
542 /* flush workq processes and remove sysfs */
543 edac_mc_workq_teardown(mci);
544 edac_remove_sysfs_mci_device(mci);
545
513 edac_printk(KERN_INFO, EDAC_MC, 546 edac_printk(KERN_INFO, EDAC_MC,
514 "Removed device %d for %s %s: DEV %s\n", mci->mc_idx, 547 "Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
515 mci->mod_name, mci->ctl_name, dev_name(mci)); 548 mci->mod_name, mci->ctl_name, dev_name(mci));
549
516 return mci; 550 return mci;
517} 551}
518EXPORT_SYMBOL_GPL(edac_mc_del_mc); 552EXPORT_SYMBOL_GPL(edac_mc_del_mc);