aboutsummaryrefslogblamecommitdiffstats
path: root/kernel/sched_fair.c
blob: 3816f217f119ef635cf169f268b5140c6e0f66c6 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
















                                                                    


                                                                          

   

                             
  
                                                   
                                                          
  
                                                             


                                                                
  

                                                                   
   
                                                

  
                                                      
                                                            
   
                                                       

  

                                                                 
                                         



                                                           
   
                                                           

  







                                                                 
                                   
                                                            




                                                                      
                                                         
 

                                                                

                                                 



                                                               




                                                                  
                              
 
                                                   

                                                     
                          

 

                                                        
 















































                                                                                










































                                                                              
                                     
 


                                                     



                                 

                                     
 
                                                               
 
                                

 
































                                                                            




                                                                     

                                     




                                                               
                                                              
 

                                                   




                                        
                                                              







                                                   
                                                                            
 
                                                   

 











                                                                       
                                  







                                                                            


                                      
                                                                            



                                                                
                                         











                                                                        
                                                      










                                                                      
                     
                                                    


                                                                

 
                                                                            
 

                                                   


                                                   
         
 
                                                         

 

                                                                     





                                                             

 
                                                                     
 
                                                                
 

                            

                                                             

 



                                                               















                                                                               

  
             



                                                             

                                                                       




                     






                                                                             


                                                   
                                                    

                                                
                                                      
                                     




                      



                                                                    
                
   
                                                                      
 
                                                                    
 
                                   



                                         
 








                                                                      

 
  
                                                           
  
           
   
                                                                       
 
                                                            


  



                                                                        

                                                               
 
                                          
 
                                                                            

                                             
                                                      
                                                                
                                              
                                    

 
                                              
 
                                                 
                                       









                                                              
                                                             

                        
 

                                                




                                                            
                                                                
         


                  
                                                                       
 
                                                            

 


                                         
                                                                                
 



                                                               
                               
                                                    

 
           
                                                                     
 

                                                                


                                                               
                                         


                  
                                                                    
 



                                                          
                               
                                                  





                                                        
                                                                       



                                            
                                              

 



                                                   












                                                                



                                                                      

                                                             
                                 
                                                             

                                                          







                                                                      

                                                             
                                 
                                                              

                                               



                             
                                                                           
 

                              
                                                                   
                                                      








                                                    

                                                               

                              
                                                                   
                                                      








                                                    






                                                                           
 


                                                                             
                                                               



         












                                                                        
           

                                                                         
                                            
 





                                                                         
                                               
                                                     
 
                       
                                                               



                                                                    



                                                                            
                           

                                                                          



                                                                     
 

                                                                          

         
                                


           
                                                                          

          
                                                       
           
                            
                                           
 
                     
                                            
                                            
         
 
                                         
                                 

                                             

 
                                                                           







                                    





                                                                         
           
                                                                         
 




                                                       
                                         
                    
                        



                                                              
                                                                       
                                                              
                                                                       
                 
      

         
                                  
 
                               

                                             
                                    




                                                              
           
                                                                    
 

                                                
                                                  
                                                                          
                                         
                                                  





                                                                          

 
           
                                                               
 










                                                                          
                                            
                          





                                                                  
                                                              



                                                                          
                                                         

 


                                                                          
                                                                   
 

                                                             

                                                                        
 



                                                                        

 
                                                                             





                                                           
                                    
 
                                   
                          
                                                      


                                                       
                            

 

                                                                         
 
          
                                                       
           
                            
 




                                                                         



                                                  







                                                                         
                                                                  
                                                 





                                                   


                                                                   



















                                                                           
                                  
                                                           
 
                                        

         















                                                                
                                



                                                       



                                               

      




                                                          
                                                                               

                              
                                         

                                   
                              

                                       
                                                   
                           
         
 
                          






                                                         
                                                                              

                              
                                         


                                       
                                                  
                                                                              
                                        
                              
                          
         
 
                          


  


                                                                       
   
                                          
 


                                                        

          




                                              

                                  
                                                                                
                                    
                  
                                                               
                   
                                    




                                                  
           
                                               


                                             
                                                                       



                                                                

                                                                     
           
                                               


  



                                                              
                                                              
                                                   





                                                    

                                

















                                                                              









                                                                           
                                                             


                                  


                                                                  


                                                                   












                                                                           
                                    






                                                           
 
                              




















                                                                               

                                                          
 
                                              




                          





                                                                       
                                   
                                    








                                                                           


                                         
                                         
 

                                
 




                                






                                                                
                       
         
 
                  
 
 
     
 

                                                                          
 
                  
 
 

      
          
                                                             

                                                                        

                                   

                                                 

                                     
                             
                     
 
                                                                              

                         



                                                                          
          



                                                              






                                                                     
 

                                   
 

                                                                             

          


                                                              
           

                             



                                                        

                                                                         












                                                         

                                                               
                                                 
                                        
                                      
                           
                               
                
 
                                      
                                             
                                           
                                   
 

                                 



                                                  
                                       
                                                                        




                                     
                                                                    
                         



                                                                       
                     
                         
 



                                                             
                                          

                                               
                                                                           


                                                                 







                                                                  
                                        


                 
    



                                     





























                                                                              


                                                             


                                                               
          

                                                                      
           
















                                                                              




                    




















                                                                         
                                     





                         

                                                   



                                                        



                                                   



                                                        

 
  

                                                              
                                                                                

                                            
                                                           
                                                  
 
                            
 
                                         


                                   
 


                                                          


                                









                                                                              

                                   
 






                                                                           
          
                                                                             

                     
                                                
                       
 


                                                                  
                       
         
 

                                        
 


                                                                            



                                   












                                                          

 
                                                             
 
                              






                                          
                                              



                                                                           
                                            
                                            


                                          



                                 




                                  
                                                                       





                                            
                                            


         
                 










                                                                 
                           
                                                                      
 

                                     
 


                                   


                                                               
 






                                                             
                                                                   





                                                            
                                                                         

 




                                                                              
 
                                           
 


                                                        
 


                                                            
 
 
                              
                    
                                                                       
                                              

                                                                   
 
                                           

                                          
 
                        
                                   
 
                                                         
                                                                        

                                                                           
                                         
 


                              
                                                 

                                 

                                                                 
 
                                                                            
                                                                               
                                                         
 
                                

                                 
                                             
                                                                     
 

                                            

                              
                          
 
                                             
 











                                                                       
 






















                                                                            
                       
 


                                                         
                                                                               





                                            
                                                









                                                                
                                                               

                                               
                                                               
                                          


                             
                            
                                    
 
                                                                           
                                                                       
                                                                
                  


                                                                             
                                                   
                                       
         
 
                                    

 















                                                                        
                                             















                                                                  
                                             

 












                                                                      









                                                   


                                    

                                                    



                                                    
                                                       



                                                      
                 

                                                      
                                                    
                                                     
      
 
                                                     

                                                 


                                                    



                                                   


                         
                                                        
 

                              
                        
                                                 
                                             
                          

      
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 4000000ULL;

/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 5;

/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the agressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 5000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

static const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* CONFIG_FAIR_GROUP_SCHED */

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */


/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			struct load_weight lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to be inserted task
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_max, max(se->wait_max,
			rq_of(cfs_rq)->clock - se->wait_start));
	schedstat_set(se->wait_count, se->wait_count + 1);
	schedstat_set(se->wait_sum, se->wait_sum +
			rq_of(cfs_rq)->clock - se->wait_start);
	schedstat_set(se->wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, se->load.weight);
		list_add(&se->group_node, &cfs_rq->tasks);
	}
	cfs_rq->nr_running++;
	se->on_rq = 1;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}
	cfs_rq->nr_running--;
	se->on_rq = 0;
}

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
		struct task_struct *tsk = task_of(se);

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->sleep_max))
			se->sleep_max = delta;

		se->sleep_start = 0;
		se->sum_sleep_runtime += delta;

		account_scheduler_latency(tsk, delta >> 10, 1);
	}
	if (se->block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
		struct task_struct *tsk = task_of(se);

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->block_max))
			se->block_max = delta;

		se->block_start = 0;
		se->sum_sleep_runtime += delta;

		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {

			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     delta >> 20);
		}
		account_scheduler_latency(tsk, delta >> 10, 0);
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	if (!initial) {
		/* sleeps upto a single latency don't count. */
		if (sched_feat(NEW_FAIR_SLEEPERS)) {
			unsigned long thresh = sysctl_sched_latency;

			/*
			 * Convert the sleeper threshold into virtual time.
			 * SCHED_IDLE is a special sub-class.  We care about
			 * fairness only relative to other SCHED_IDLE tasks,
			 * all of which have the same weight.
			 */
			if (sched_feat(NORMALIZED_SLEEPER) &&
					task_of(se)->policy != SCHED_IDLE)
				thresh = calc_delta_fair(thresh, se);

			vruntime -= thresh;
		}

		/* ensure we never gain time by being placed backwards. */
		vruntime = max_vruntime(se->vruntime, vruntime);
	}

	se->vruntime = vruntime;
}

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	account_entity_enqueue(cfs_rq, se);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
}

static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		cfs_rq->last = NULL;

	if (cfs_rq->next == se)
		cfs_rq->next = NULL;
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	for_each_sched_entity(se)
		__clear_buddies(cfs_rq_of(se), se);
}

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (sleep) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
	update_min_vruntime(cfs_rq);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
	}
}

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
		return cfs_rq->next;

	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
		return cfs_rq->last;

	return se;
}

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	WARN_ON(task_rq(p) != rq);

	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}

		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
		if (rq->curr != p)
			delta = max_t(s64, 10000LL, delta);

		hrtick_start(rq, delta);
	}
}

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	if (curr->sched_class != &fair_sched_class)
		return;

	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
		hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
#endif

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		wakeup = 1;
	}

	hrtick_update(rq);
}

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, sleep);
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
		sleep = 1;
	}

	hrtick_update(rq);
}

/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *rightmost, *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;

	clear_buddies(cfs_rq, se);

	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);

		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
	 */
	rightmost = __pick_last_entity(cfs_rq);
	/*
	 * Already in the rightmost position?
	 */
	if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
		return;

	/*
	 * Minimally necessary key value to be last in the tree:
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
	 */
	se->vruntime = rightmost->vruntime + 1;
}

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 * Domains may include CPUs that are not usable for migration,
 * hence we need to mask them out (cpu_active_mask)
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	struct sched_domain *sd;
	int i;
	unsigned int chosen_wakeup_cpu;
	int this_cpu;

	/*
	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
	 * are idle and this is not a kernel thread and this task's affinity
	 * allows it to be moved to preferred cpu, then just move!
	 */

	this_cpu = smp_processor_id();
	chosen_wakeup_cpu =
		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;

	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
		idle_cpu(cpu) && idle_cpu(this_cpu) &&
		p->mm && !(p->flags & PF_KTHREAD) &&
		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
		return chosen_wakeup_cpu;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must be also busy(in most cases) as they didn't already
	 * pickup the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalities associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if ((sd->flags & SD_WAKE_IDLE)
		    || ((sd->flags & SD_WAKE_IDLE_FAR)
			&& !task_hot(p, task_rq(p)->clock, sd))) {
			for_each_cpu_and(i, sched_domain_span(sd),
					 &p->cpus_allowed) {
				if (cpu_active(i) && idle_cpu(i)) {
					if (i != task_cpu(p)) {
						schedstat_inc(p,
						       se.nr_wakeups_idle);
					}
					return i;
				}
			}
		} else {
			break;
		}
	}
	return cpu;
}
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

#ifdef CONFIG_SMP

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * The problem is that perfectly aligning the shares is rather expensive, hence
 * we try to avoid doing that too often - see update_shares(), which ratelimits
 * this change.
 *
 * We compensate this by not only taking the current delta into account, but
 * also considering the delta between when the shares were last adjusted and
 * now.
 *
 * We still saw a performance dip, some tracing learned us that between
 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
 * significantly. Therefore try to bias the error in direction of failing
 * the affine wakeup.
 *
 */
static long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)
		return wl;

	/*
	 * By not taking the decrease of shares on the other cpu into
	 * account our error leans towards reducing the affine wakeups.
	 */
	if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;

	for_each_sched_entity(se) {
		long S, rw, s, a, b;
		long more_w;

		/*
		 * Instead of using this increment, also add the difference
		 * between when the shares were last updated and now.
		 */
		more_w = se->my_q->load.weight - se->my_q->rq_weight;
		wl += more_w;
		wg += more_w;

		S = se->my_q->tg->shares;
		s = se->my_q->shares;
		rw = se->my_q->rq_weight;

		a = S*(rw + wl);
		b = S*rw + s*wg;

		wl = s*(a-b);

		if (likely(b))
			wl /= b;

		/*
		 * Assume the group is already running and will
		 * thus already be accounted for in the weight.
		 *
		 * That is, moving shares between CPUs, does not
		 * alter the group weight.
		 */
		wg = 0;
	}

	return wl;
}

#else

static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
{
	return wl;
}

#endif

static int
wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
	    int idx, unsigned long load, unsigned long this_load,
	    unsigned int imbalance)
{
	struct task_struct *curr = this_rq->curr;
	struct task_group *tg;
	unsigned long tl = this_load;
	unsigned long tl_per_task;
	unsigned long weight;
	int balanced;

	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
		return 0;

	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
			p->se.avg_overlap > sysctl_sched_migration_cost))
		sync = 0;

	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
	if (sync) {
		tg = task_group(current);
		weight = current->se.load.weight;

		tl += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}

	tg = task_group(p);
	weight = p->se.load.weight;

	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));

	/*
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
	 */
	if (sync && balanced)
		return 1;

	schedstat_inc(p, se.nr_wakeups_affine_attempts);
	tl_per_task = cpu_avg_load_per_task(this_cpu);

	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
			tl_per_task)) {
		/*
		 * This domain has SD_WAKE_AFFINE and
		 * p is cache cold in this domain, and
		 * there is no bad imbalance.
		 */
		schedstat_inc(this_sd, ttwu_move_affine);
		schedstat_inc(p, se.nr_wakeups_affine);

		return 1;
	}
	return 0;
}

static int select_task_rq_fair(struct task_struct *p, int sync)
{
	struct sched_domain *sd, *this_sd = NULL;
	int prev_cpu, this_cpu, new_cpu;
	unsigned long load, this_load;
	struct rq *this_rq;
	unsigned int imbalance;
	int idx;

	prev_cpu	= task_cpu(p);
	this_cpu	= smp_processor_id();
	this_rq		= cpu_rq(this_cpu);
	new_cpu		= prev_cpu;

	if (prev_cpu == this_cpu)
		goto out;
	/*
	 * 'this_sd' is the first domain that both
	 * this_cpu and prev_cpu are present in:
	 */
	for_each_domain(this_cpu, sd) {
		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
		goto out;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (!this_sd)
		goto out;

	idx = this_sd->wake_idx;

	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

	load = source_load(prev_cpu, idx);
	this_load = target_load(this_cpu, idx);

	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
				     load, this_load, imbalance))
		return this_cpu;

	/*
	 * Start passive balancing when half the imbalance_pct
	 * limit is reached.
	 */
	if (this_sd->flags & SD_WAKE_BALANCE) {
		if (imbalance*this_load <= 100*load) {
			schedstat_inc(this_sd, ttwu_move_balance);
			schedstat_inc(p, se.nr_wakeups_passive);
			return this_cpu;
		}
	}

out:
	return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */

/*
 * Adaptive granularity
 *
 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
 * with the limit of wakeup_gran -- when it never does a wakeup.
 *
 * So the smaller avg_wakeup is the faster we want this task to preempt,
 * but we don't want to treat the preemptee unfairly and therefore allow it
 * to run for at least the amount of time we'd like to run.
 *
 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
 *
 * NOTE: we use *nr_running to scale with load, this nicely matches the
 *       degrading latency on load.
 */
static unsigned long
adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
{
	u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
	u64 gran = 0;

	if (this_run < expected_wakeup)
		gran = expected_wakeup - this_run;

	return min_t(s64, gran, sysctl_sched_wakeup_granularity);
}

static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;

	if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
		gran = adaptive_gran(curr, se);

	/*
	 * Since its curr running now, convert the gran from real-time
	 * to virtual-time in his units.
	 */
	if (sched_feat(ASYM_GRAN)) {
		/*
		 * By using 'se' instead of 'curr' we penalize light tasks, so
		 * they get preempted easier. That is, if 'se' < 'curr' then
		 * the resulting gran will be larger, therefore penalizing the
		 * lighter, if otoh 'se' > 'curr' then the resulting gran will
		 * be smaller, again penalizing the lighter task.
		 *
		 * This is especially important for buddies when the leftmost
		 * task is higher priority than the buddy.
		 */
		if (unlikely(se->load.weight != NICE_0_LOAD))
			gran = calc_delta_fair(gran, se);
	} else {
		if (unlikely(curr->load.weight != NICE_0_LOAD))
			gran = calc_delta_fair(gran, curr);
	}

	return gran;
}

/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(curr, se);
	if (vdiff > gran)
		return 1;

	return 0;
}

static void set_last_buddy(struct sched_entity *se)
{
	if (likely(task_of(se)->policy != SCHED_IDLE)) {
		for_each_sched_entity(se)
			cfs_rq_of(se)->last = se;
	}
}

static void set_next_buddy(struct sched_entity *se)
{
	if (likely(task_of(se)->policy != SCHED_IDLE)) {
		for_each_sched_entity(se)
			cfs_rq_of(se)->next = se;
	}
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);

	update_curr(cfs_rq);

	if (unlikely(rt_prio(p->prio))) {
		resched_task(curr);
		return;
	}

	if (unlikely(p->sched_class != &fair_sched_class))
		return;

	if (unlikely(se == pse))
		return;

	/*
	 * Only set the backward buddy when the current task is still on the
	 * rq. This can happen when a wakeup gets interleaved with schedule on
	 * the ->pre_schedule() or idle_balance() point, either of which can
	 * drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class, for
	 * obvious reasons its a bad idea to schedule back to the idle thread.
	 */
	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
		set_last_buddy(se);
	set_next_buddy(pse);

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 */
	if (test_tsk_need_resched(curr))
		return;

	/*
	 * Batch and idle tasks do not preempt (their preemption is driven by
	 * the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL))
		return;

	/* Idle tasks are by definition preempted by everybody. */
	if (unlikely(curr->policy == SCHED_IDLE)) {
		resched_task(curr);
		return;
	}

	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
			(se->avg_overlap < sysctl_sched_migration_cost &&
			 pse->avg_overlap < sysctl_sched_migration_cost))) {
		resched_task(curr);
		return;
	}

	find_matching_se(&se, &pse);

	while (se) {
		BUG_ON(!pse);

		if (wakeup_preempt_entity(se, pse) == 1) {
			resched_task(curr);
			break;
		}

		se = parent_entity(se);
		pse = parent_entity(pse);
	}
}

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		/*
		 * If se was a buddy, clear it so that it will have to earn
		 * the favour again.
		 */
		__clear_buddies(cfs_rq, se);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	hrtick_start_fair(rq, p);

	return p;
}

/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods:
 */

/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
{
	struct task_struct *p = NULL;
	struct sched_entity *se;

	if (next == &cfs_rq->tasks)
		return NULL;

	se = list_entry(next, struct sched_entity, group_node);
	p = task_of(se);
	cfs_rq->balance_iterator = next->next;

	return p;
}

static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
}

static struct task_struct *load_balance_next_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}

static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move, struct sched_domain *sd,
		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
		struct cfs_rq *cfs_rq)
{
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;
	cfs_rq_iterator.arg = cfs_rq;

	return balance_tasks(this_rq, this_cpu, busiest,
			max_load_move, sd, idle, all_pinned,
			this_best_prio, &cfs_rq_iterator);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	long rem_load_move = max_load_move;
	int busiest_cpu = cpu_of(busiest);
	struct task_group *tg;

	rcu_read_lock();
	update_h_load(busiest_cpu);

	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
		u64 rem_load, moved_load;

		/*
		 * empty group
		 */
		if (!busiest_cfs_rq->task_weight)
			continue;

		rem_load = (u64)rem_load_move * busiest_weight;
		rem_load = div_u64(rem_load, busiest_h_load + 1);

		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
				rem_load, sd, idle, all_pinned, this_best_prio,
				tg->cfs_rq[busiest_cpu]);

		if (!moved_load)
			continue;

		moved_load *= busiest_h_load;
		moved_load = div_u64(moved_load, busiest_weight + 1);

		rem_load_move -= moved_load;
		if (rem_load_move < 0)
			break;
	}
	rcu_read_unlock();

	return max_load_move - rem_load_move;
}
#else
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	return __load_balance_fair(this_rq, this_cpu, busiest,
			max_load_move, sd, idle, all_pinned,
			this_best_prio, &busiest->cfs);
}
#endif

static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		   struct sched_domain *sd, enum cpu_idle_type idle)
{
	struct cfs_rq *busy_cfs_rq;
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;

	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
		/*
		 * pass busy_cfs_rq argument into
		 * load_balance_[start|next]_fair iterators
		 */
		cfs_rq_iterator.arg = busy_cfs_rq;
		if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
				       &cfs_rq_iterator))
		    return 1;
	}

	return 0;
}
#endif /* CONFIG_SMP */

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}
}

/*
 * Share the fairness runtime between parent and child, thus the
 * total amount of pressure for CPU stays equal - new tasks
 * get a chance to run but frequent forkers are not allowed to
 * monopolize the CPU. Note: the parent runqueue is locked,
 * the child is not running yet.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int this_cpu = smp_processor_id();

	sched_info_queued(p);

	update_curr(cfs_rq);
	place_entity(cfs_rq, se, 1);

	/* 'curr' will be NULL if the child belongs to a different group */
	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
			curr && curr->vruntime < se->vruntime) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	enqueue_task_fair(rq, p, 0);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void prio_changed_fair(struct rq *rq, struct task_struct *p,
			      int oldprio, int running)
{
	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (running) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

/*
 * We switched to the sched_fair class.
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p,
			     int running)
{
	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (running)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}

/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se)
		set_next_entity(cfs_rq_of(se), se);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void moved_group_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_curr(cfs_rq);
	place_entity(cfs_rq, &p->se, 1);
}
#endif

/*
 * All the scheduling class methods:
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_new		= task_new_fair,

	.prio_changed		= prio_changed_fair,
	.switched_to		= switched_to_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.moved_group		= moved_group_fair,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif
>= (unsigned long) &_end, addr = (unsigned long) obj; #ifdef CONFIG_SMP int i; #endif /* * static variable? */ if ((addr >= start) && (addr < end)) return 1; #ifdef CONFIG_SMP /* * percpu var? */ for_each_possible_cpu(i) { start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + per_cpu_offset(i); if ((addr >= start) && (addr < end)) return 1; } #endif /* * module var? */ return is_module_address(addr); } /* * To make lock name printouts unique, we calculate a unique * class->name_version generation counter: */ static int count_matching_names(struct lock_class *new_class) { struct lock_class *class; int count = 0; if (!new_class->name) return 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) count = max(count, class->name_version); } return count + 1; } /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP /* * If the architecture calls into lockdep before initializing * the hashes then we'll warn about it later. (we cannot printk * right now) */ if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; save_stack_trace(&lockdep_init_trace); } #endif /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: */ if (unlikely(!lock->key)) lock->key = (void *)lock; /* * NOTE: the class-key must be unique. For dynamic locks, a static * lock_class_key variable is passed in through the mutex_init() * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. */ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lockdep_map)); key = lock->key->subkeys + subclass; hash_head = classhashentry(key); /* * We can walk the hash lockfree, because the hash only * grows, and we are careful when adding entries to the end: */ list_for_each_entry(class, hash_head, hash_entry) { if (class->key == key) { WARN_ON_ONCE(class->name != lock->name); return class; } } return NULL; } /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; unsigned long flags; class = look_up_lock_class(lock, subclass); if (likely(class)) return class; /* * Debug-check: all keys must be persistent! */ if (!static_obj(lock->key)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); return NULL; } key = lock->key->subkeys + subclass; hash_head = classhashentry(key); raw_local_irq_save(flags); if (!graph_lock()) { raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ list_for_each_entry(class, hash_head, hash_entry) if (class->key == key) goto out_unlock_set; /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { raw_local_irq_restore(flags); return NULL; } raw_local_irq_restore(flags); printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); return NULL; } class = lock_classes + nr_lock_classes++; debug_atomic_inc(&nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; INIT_LIST_HEAD(&class->lock_entry); INIT_LIST_HEAD(&class->locks_before); INIT_LIST_HEAD(&class->locks_after); class->name_version = count_matching_names(class); /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) printk("#%d", class->name_version); printk("\n"); dump_stack(); raw_local_irq_save(flags); if (!graph_lock()) { raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); raw_local_irq_restore(flags); if (!subclass || force) lock->class_cache = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; return class; } #ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns * with NULL on failure) */ static struct lock_list *alloc_list_entry(void) { if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { if (!debug_locks_off_graph_unlock()) return NULL; printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); return NULL; } return list_entries + nr_list_entries++; } /* * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, struct list_head *head, unsigned long ip, int distance) { struct lock_list *entry; /* * Lock not present yet - get a new dependency struct and * add it to the list: */ entry = alloc_list_entry(); if (!entry) return 0; if (!save_trace(&entry->trace)) return 0; entry->class = this; entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation * that must be protected by the spinlock. But this also means * we must make new entries visible only once writes to the * entry become visible - hence the RCU op: */ list_add_tail_rcu(&entry->entry, head); return 1; } /* * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe * checking. * * (to keep the stackframe of the recursive functions small we * use these global variables, and we also mark various helper * functions as noinline.) */ static struct held_lock *check_source, *check_target; /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ static noinline int print_circular_bug_entry(struct lock_list *target, unsigned int depth) { if (debug_locks_silent) return 0; printk("\n-> #%u", depth); print_lock_name(target->class); printk(":\n"); print_stack_trace(&target->trace, 6); return 0; } /* * When a circular dependency is detected, print the * header first: */ static noinline int print_circular_bug_header(struct lock_list *entry, unsigned int depth) { struct task_struct *curr = current; if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=======================================================\n"); printk( "[ INFO: possible circular locking dependency detected ]\n"); print_kernel_version(); printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_source); printk("\nbut task is already holding lock:\n"); print_lock(check_target); printk("\nwhich lock already depends on the new lock.\n\n"); printk("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); return 0; } static noinline int print_circular_bug_tail(void) { struct task_struct *curr = current; struct lock_list this; if (debug_locks_silent) return 0; this.class = hlock_class(check_source); if (!save_trace(&this.trace)) return 0; print_circular_bug_entry(&this, 0); printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); return 0; } #define RECURSION_LIMIT 40 static int noinline print_infinite_recursion_bug(void) { if (!debug_locks_off_graph_unlock()) return 0; WARN_ON(1); return 0; } unsigned long __lockdep_count_forward_deps(struct lock_class *class, unsigned int depth) { struct lock_list *entry; unsigned long ret = 1; if (lockdep_dependency_visit(class, depth)) return 0; /* * Recurse this class's dependency list: */ list_for_each_entry(entry, &class->locks_after, entry) ret += __lockdep_count_forward_deps(entry->class, depth + 1); return ret; } unsigned long lockdep_count_forward_deps(struct lock_class *class) { unsigned long ret, flags; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(class, 0); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; } unsigned long __lockdep_count_backward_deps(struct lock_class *class, unsigned int depth) { struct lock_list *entry; unsigned long ret = 1; if (lockdep_dependency_visit(class, depth)) return 0; /* * Recurse this class's dependency list: */ list_for_each_entry(entry, &class->locks_before, entry) ret += __lockdep_count_backward_deps(entry->class, depth + 1); return ret; } unsigned long lockdep_count_backward_deps(struct lock_class *class) { unsigned long ret, flags; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(class, 0); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; } /* * Prove that the dependency graph starting at <entry> can not * lead to <target>. Print an error and return 0 if it does. */ static noinline int check_noncircular(struct lock_class *source, unsigned int depth) { struct lock_list *entry; if (lockdep_dependency_visit(source, depth)) return 1; debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); /* * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_after, entry) { if (entry->class == hlock_class(check_target)) return print_circular_bug_header(entry, depth+1); debug_atomic_inc(&nr_cyclic_checks); if (!check_noncircular(entry->class, depth+1)) return print_circular_bug_entry(entry, depth+1); } return 1; } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ static enum lock_usage_bit find_usage_bit; static struct lock_class *forwards_match, *backwards_match; /* * Find a node in the forwards-direction dependency sub-graph starting * at <source> that matches <find_usage_bit>. * * Return 2 if such a node exists in the subgraph, and put that node * into <forwards_match>. * * Return 1 otherwise and keep <forwards_match> unchanged. * Return 0 on error. */ static noinline int find_usage_forwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; int ret; if (lockdep_dependency_visit(source, depth)) return 1; if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_forwards_checks); if (source->usage_mask & (1 << find_usage_bit)) { forwards_match = source; return 2; } /* * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_after, entry) { debug_atomic_inc(&nr_find_usage_forwards_recursions); ret = find_usage_forwards(entry->class, depth+1); if (ret == 2 || ret == 0) return ret; } return 1; } /* * Find a node in the backwards-direction dependency sub-graph starting * at <source> that matches <find_usage_bit>. * * Return 2 if such a node exists in the subgraph, and put that node * into <backwards_match>. * * Return 1 otherwise and keep <backwards_match> unchanged. * Return 0 on error. */ static noinline int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; int ret; if (lockdep_dependency_visit(source, depth)) return 1; if (!__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_backwards_checks); if (source->usage_mask & (1 << find_usage_bit)) { backwards_match = source; return 2; } if (!source && debug_locks_off_graph_unlock()) { WARN_ON(1); return 0; } /* * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_before, entry) { debug_atomic_inc(&nr_find_usage_backwards_recursions); ret = find_usage_backwards(entry->class, depth+1); if (ret == 2 || ret == 0) return ret; } return 1; } static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit1, enum lock_usage_bit bit2, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n======================================================\n"); printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); print_kernel_version(); printk( "------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, curr->hardirqs_enabled, curr->softirqs_enabled); print_lock(next); printk("\nand this task is already holding:\n"); print_lock(prev); printk("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); printk(" ->"); print_lock_name(hlock_class(next)); printk("\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); print_lock_name(backwards_match); printk("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_match->usage_traces + bit1, 1); printk("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_match); printk("\n... which became %s-irq-unsafe at:\n", irqclass); printk("..."); print_stack_trace(forwards_match->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); print_lock_dependencies(backwards_match, 0); printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); print_lock_dependencies(forwards_match, 0); printk("\nstack backtrace:\n"); dump_stack(); return 0; } static int check_usage(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit_backwards, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; find_usage_bit = bit_backwards; /* fills in <backwards_match> */ ret = find_usage_backwards(hlock_class(prev), 0); if (!ret || ret == 1) return ret; find_usage_bit = bit_forwards; ret = find_usage_forwards(hlock_class(next), 0); if (!ret || ret == 1) return ret; /* ret == 2 */ return print_bad_irq_dependency(curr, prev, next, bit_backwards, bit_forwards, irqclass); } static const char *state_names[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE), #include "lockdep_states.h" #undef LOCKDEP_STATE }; static const char *state_rnames[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE)"-READ", #include "lockdep_states.h" #undef LOCKDEP_STATE }; static inline const char *state_name(enum lock_usage_bit bit) { return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; } static int exclusive_bit(int new_bit) { /* * USED_IN * USED_IN_READ * ENABLED * ENABLED_READ * * bit 0 - write/read * bit 1 - used_in/enabled * bit 2+ state */ int state = new_bit & ~3; int dir = new_bit & 2; /* * keep state, bit flip the direction and strip read. */ return state | (dir ^ 2); } static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit) { /* * Prove that the new dependency does not connect a hardirq-safe * lock with a hardirq-unsafe lock - to achieve this we search * the backwards-subgraph starting at <prev>, and the * forwards-subgraph starting at <next>: */ if (!check_usage(curr, prev, next, bit, exclusive_bit(bit), state_name(bit))) return 0; bit++; /* _READ */ /* * Prove that the new dependency does not connect a hardirq-safe-read * lock with a hardirq-unsafe lock - to achieve this we search * the backwards-subgraph starting at <prev>, and the * forwards-subgraph starting at <next>: */ if (!check_usage(curr, prev, next, bit, exclusive_bit(bit), state_name(bit))) return 0; return 1; } static int check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { #define LOCKDEP_STATE(__STATE) \ if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ return 0; #include "lockdep_states.h" #undef LOCKDEP_STATE return 1; } static void inc_chains(void) { if (current->hardirq_context) nr_hardirq_chains++; else { if (current->softirq_context) nr_softirq_chains++; else nr_process_chains++; } } #else static inline int check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { return 1; } static inline void inc_chains(void) { nr_process_chains++; } #endif static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=============================================\n"); printk( "[ INFO: possible recursive locking detected ]\n"); print_kernel_version(); printk( "---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); printk("\nbut task is already holding lock:\n"); print_lock(prev); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); return 0; } /* * Check whether we are holding such a class already. * * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int check_deadlock(struct task_struct *curr, struct held_lock *next, struct lockdep_map *next_instance, int read) { struct held_lock *prev; struct held_lock *nest = NULL; int i; for (i = 0; i < curr->lockdep_depth; i++) { prev = curr->held_locks + i; if (prev->instance == next->nest_lock) nest = prev; if (hlock_class(prev) != hlock_class(next)) continue; /* * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((read == 2) && prev->read) return 2; /* * We're holding the nest_lock, which serializes this lock's * nesting behaviour. */ if (nest) return 2; return print_deadlock_bug(curr, prev, next); } return 1; } /* * There was a chain-cache miss, and we are about to add a new dependency * to a previous lock. We recursively validate the following rules: * * - would the adding of the <prev> -> <next> dependency create a * circular dependency in the graph? [== circular deadlock] * * - does the new prev->next dependency connect any hardirq-safe lock * (in the full backwards-subgraph starting at <prev>) with any * hardirq-unsafe lock (in the full forwards-subgraph starting at * <next>)? [== illegal lock inversion with hardirq contexts] * * - does the new prev->next dependency connect any softirq-safe lock * (in the full backwards-subgraph starting at <prev>) with any * softirq-unsafe lock (in the full forwards-subgraph starting at * <next>)? [== illegal lock inversion with softirq contexts] * * any of these scenarios could lead to a deadlock. * * Then if all the validations pass, we add the forwards and backwards * dependency. */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance) { struct lock_list *entry; int ret; /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by * forward-recursing into the graph starting at <next>, and * checking whether we can reach <prev>.) * * We are using global variables to control the recursion, to * keep the stackframe size of the recursive functions low: */ check_source = next; check_target = prev; if (!(check_noncircular(hlock_class(next), 0))) return print_circular_bug_tail(); if (!check_prev_add_irq(curr, prev, next)) return 0; /* * For recursive read-locks we do all the dependency checks, * but we dont store read-triggered dependencies (only * write-triggered dependencies). This ensures that only the * write-side dependencies matter, and that if for example a * write-lock never takes any other locks, then the reads are * equivalent to a NOP. */ if (next->read == 2 || prev->read == 2) return 1; /* * Is the <prev> -> <next> dependency already present? * * (this may occur even though this is a new chain: consider * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 * chains - the second one will be new, but L1 already has * L2 added to its dependency list, due to the first chain.) */ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; return 2; } } /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance); if (!ret) return 0; ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance); if (!ret) return 0; /* * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); printk(" => "); print_lock_name(hlock_class(next)); printk("\n"); dump_stack(); return graph_lock(); } return 1; } /* * Add the dependency to all directly-previous locks that are 'relevant'. * The ones that are relevant are (in increasing distance from curr): * all consecutive trylock entries and the final non-trylock entry - or * the end of this context's lock-chain - whichever comes first. */ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; struct held_lock *hlock; /* * Debugging checks. * * Depth must not be zero for a non-head lock: */ if (!depth) goto out_bug; /* * At least two relevant locks must exist for this * to be a head: */ if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) goto out_bug; for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth-1; /* * Only non-recursive-read entries get new dependencies * added: */ if (hlock->read != 2) { if (!check_prev_add(curr, hlock, next, distance)) return 0; /* * Stop after the first non-trylock entry, * as non-trylock entries have added their * own direct dependencies already, so this * lock is connected to them indirectly: */ if (!hlock->trylock) break; } depth--; /* * End of lock-stack? */ if (!depth) break; /* * Stop the search if we cross into another context: */ if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; } return 1; out_bug: if (!debug_locks_off_graph_unlock()) return 0; WARN_ON(1); return 0; } unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { return lock_classes + chain_hlocks[chain->base + i]; } /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is * validated. If the key is already hashed, return 0. * (On return with 1 graph_lock is held.) */ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; int i, j, n, cn; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; /* * We can walk it lock-free, because entries only get added * to the hash: */ list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(&chain_lookup_hits); if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", (unsigned long long)chain_key, class->key, class->name); return 0; } } if (very_verbose(class)) printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", (unsigned long long)chain_key, class->key, class->name); /* * Allocate a new chain entry from the static array, and add * it to the hash: */ if (!graph_lock()) return 0; /* * We have to walk the chain again locked - to avoid duplicates: */ list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; } } if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { if (!debug_locks_off_graph_unlock()) return 0; printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); printk("turning off the locking correctness validator.\n"); return 0; } chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; /* Find the first held_lock of current chain */ hlock_next = hlock; for (i = curr->lockdep_depth - 1; i >= 0; i--) { hlock_curr = curr->held_locks + i; if (hlock_curr->irq_context != hlock_next->irq_context) break; hlock_next = hlock; } i++; chain->depth = curr->lockdep_depth + 1 - i; cn = nr_chain_hlocks; while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); if (n == cn) break; cn = n; } if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = cn; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; } list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); inc_chains(); return 1; } static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it * does not add new dependencies, because trylock can be done * in any order. * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && (hlock->check == 2) && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: * * - is irq-safe, if this lock is irq-unsafe * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph * could lead back to the previous lock. * * any of these scenarios could lead to a deadlock. If * All validations */ int ret = check_deadlock(curr, hlock, lock, hlock->read); if (!ret) return 0; /* * Mark recursive read, as we jump over it when * building dependencies (just like we jump over * trylock entries): */ if (ret == 2) hlock->read = 2; /* * Add dependency only if this lock is not the head * of the chain, and if it's not a secondary read-lock: */ if (!chain_head && ret != 2) if (!check_prevs_add(curr, hlock)) return 0; graph_unlock(); } else /* after lookup_chain_cache(): */ if (unlikely(!debug_locks)) return 0; return 1; } #else static inline int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { return 1; } #endif /* * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i, id; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)hlock->prev_chain_key); return; } id = hlock->class_idx - 1; if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; chain_key = iterate_chain_key(chain_key, id); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { debug_locks_off(); WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)curr->curr_chain_key); } #endif } static int print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=================================\n"); printk( "[ INFO: inconsistent lock state ]\n"); print_kernel_version(); printk( "---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, trace_hardirqs_enabled(curr), trace_softirqs_enabled(curr)); print_lock(this); printk("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); return 0; } /* * Print out an error if an invalid bit is set: */ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) return print_usage_bug(curr, this, bad_bit, new_bit); return 1; } static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: */ static int print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, struct held_lock *this, int forwards, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=========================================================\n"); printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); print_kernel_version(); printk( "---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); else printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); print_lock_name(other); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); printk("\nthe first lock's dependencies:\n"); print_lock_dependencies(hlock_class(this), 0); printk("\nthe second lock's dependencies:\n"); print_lock_dependencies(other, 0); printk("\nstack backtrace:\n"); dump_stack(); return 0; } /* * Prove that in the forwards-direction subgraph starting at <this> * there is no lock matching <mask>: */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; find_usage_bit = bit; /* fills in <forwards_match> */ ret = find_usage_forwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); } /* * Prove that in the backwards-direction subgraph starting at <this> * there is no lock matching <mask>: */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; find_usage_bit = bit; /* fills in <backwards_match> */ ret = find_usage_backwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); print_ip_sym(curr->hardirq_enable_ip); printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); print_ip_sym(curr->hardirq_disable_ip); printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); print_ip_sym(curr->softirq_enable_ip); printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); print_ip_sym(curr->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) { #if HARDIRQ_VERBOSE return class_filter(class); #endif return 0; } static int SOFTIRQ_verbose(struct lock_class *class) { #if SOFTIRQ_VERBOSE return class_filter(class); #endif return 0; } static int RECLAIM_FS_verbose(struct lock_class *class) { #if RECLAIM_VERBOSE return class_filter(class); #endif return 0; } #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, #include "lockdep_states.h" #undef LOCKDEP_STATE }; static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { return state_verbose_f[bit >> 2](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, enum lock_usage_bit bit, const char *name); static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) { int excl_bit = exclusive_bit(new_bit); int read = new_bit & 1; int dir = new_bit & 2; /* * mark USED_IN has to look forwards -- to ensure no dependency * has ENABLED state, which would allow recursion deadlocks. * * mark ENABLED has to look backwards -- to ensure no dependee * has USED_IN state, which, again, would allow recursion deadlocks. */ check_usage_f usage = dir ? check_usage_backwards : check_usage_forwards; /* * Validate that this particular lock does not have conflicting * usage states. */ if (!valid_state(curr, this, new_bit, excl_bit)) return 0; /* * Validate that the lock dependencies don't have conflicting usage * states. */ if ((!read || !dir || STRICT_READ_CHECKS) && !usage(curr, this, excl_bit, state_name(new_bit))) return 0; /* * Check for read in write conflicts */ if (!read) { if (!valid_state(curr, this, new_bit, excl_bit + 1)) return 0; if (STRICT_READ_CHECKS && !usage(curr, this, excl_bit + 1, state_name(new_bit + 1))) return 0; } if (state_verbose(new_bit, hlock_class(this))) return 2; return 1; } enum mark_type { #define LOCKDEP_STATE(__STATE) __STATE, #include "lockdep_states.h" #undef LOCKDEP_STATE }; /* * Mark all held locks with a usage bit: */ static int mark_held_locks(struct task_struct *curr, enum mark_type mark) { enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) usage_bit += 1; /* READ */ BUG_ON(usage_bit >= LOCK_USAGE_STATES); if (!mark_lock(curr, hlock, usage_bit)) return 0; } return 1; } /* * Debugging helper: via this flag we know that we are in * 'early bootup code', and will warn about any invalid irqs-on event: */ static int early_boot_irqs_enabled; void early_boot_irqs_off(void) { early_boot_irqs_enabled = 0; } void early_boot_irqs_on(void) { early_boot_irqs_enabled = 1; } /* * Hardirqs will be enabled: */ void trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; time_hardirqs_on(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) return; if (unlikely(curr->hardirqs_enabled)) { debug_atomic_inc(&redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ if (!mark_held_locks(curr, HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage * bit for all held locks. (disabled hardirqs prevented * this bit from being set before) */ if (curr->softirqs_enabled) if (!mark_held_locks(curr, SOFTIRQ)) return; curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void trace_hardirqs_on(void) { trace_hardirqs_on_caller(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; time_hardirqs_off(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; if (curr->hardirqs_enabled) { /* * We have done an ON -> OFF transition: */ curr->hardirqs_enabled = 0; curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); } else debug_atomic_inc(&redundant_hardirqs_off); } EXPORT_SYMBOL(trace_hardirqs_off_caller); void trace_hardirqs_off(void) { trace_hardirqs_off_caller(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); /* * Softirqs will be enabled: */ void trace_softirqs_on(unsigned long ip) { struct task_struct *curr = current; if (unlikely(!debug_locks)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; if (curr->softirqs_enabled) { debug_atomic_inc(&redundant_softirqs_on); return; } /* * We'll do an OFF -> ON transition: */ curr->softirqs_enabled = 1; curr->softirq_enable_ip = ip; curr->softirq_enable_event = ++curr->irq_events; debug_atomic_inc(&softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are * enabled too: */ if (curr->hardirqs_enabled) mark_held_locks(curr, SOFTIRQ); } /* * Softirqs were disabled: */ void trace_softirqs_off(unsigned long ip) { struct task_struct *curr = current; if (unlikely(!debug_locks)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; if (curr->softirqs_enabled) { /* * We have done an ON -> OFF transition: */ curr->softirqs_enabled = 0; curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; debug_atomic_inc(&softirqs_off_events); DEBUG_LOCKS_WARN_ON(!softirq_count()); } else debug_atomic_inc(&redundant_softirqs_off); } static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) { struct task_struct *curr = current; if (unlikely(!debug_locks)) return; /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_WAIT)) return; /* this guy won't enter reclaim */ if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) return; /* We're only interested __GFP_FS allocations for now */ if (!(gfp_mask & __GFP_FS)) return; if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; mark_held_locks(curr, RECLAIM_FS); } static void check_flags(unsigned long flags); void lockdep_trace_alloc(gfp_t gfp_mask) { unsigned long flags; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; __lockdep_trace_alloc(gfp_mask, flags); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: */ if (!hlock->trylock) { if (hlock->read) { if (curr->hardirq_context) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { if (curr->hardirq_context) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) return 0; } } if (!hlock->hardirqs_off) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, LOCK_ENABLED_SOFTIRQ_READ)) return 0; } else { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, LOCK_ENABLED_SOFTIRQ)) return 0; } } /* * We reuse the irq context infrastructure more broadly as a general * context checking code. This tests GFP_FS recursion (a lock taken * during reclaim for a GFP_FS allocation is held over a GFP_FS * allocation). */ if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) return 0; } else { if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) return 0; } } return 1; } static int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { unsigned int depth = curr->lockdep_depth; /* * Keep track of points where we cross into an interrupt context: */ hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + curr->softirq_context; if (depth) { struct held_lock *prev_hlock; prev_hlock = curr->held_locks + depth-1; /* * If we cross into another context, reset the * hash key (this also prevents the checking and the * adding of the dependency to 'prev'): */ if (prev_hlock->irq_context != hlock->irq_context) return 1; } return 0; } #else static inline int mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { WARN_ON(1); return 1; } static inline int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { return 1; } static inline int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { return 0; } void lockdep_trace_alloc(gfp_t gfp_mask) { } #endif /* * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; /* * If already set then do not dirty the cacheline, * nor do any checks: */ if (likely(hlock_class(this)->usage_mask & new_mask)) return 1; if (!graph_lock()) return 0; /* * Make sure we didnt race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); return 1; } hlock_class(this)->usage_mask |= new_mask; if (!save_trace(hlock_class(this)->usage_traces + new_bit)) return 0; switch (new_bit) { #define LOCKDEP_STATE(__STATE) \ case LOCK_USED_IN_##__STATE: \ case LOCK_USED_IN_##__STATE##_READ: \ case LOCK_ENABLED_##__STATE: \ case LOCK_ENABLED_##__STATE##_READ: #include "lockdep_states.h" #undef LOCKDEP_STATE ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; break; case LOCK_USED: debug_atomic_dec(&nr_unused_locks); break; default: if (!debug_locks_off_graph_unlock()) return 0; WARN_ON(1); return 0; } graph_unlock(); /* * We must printk outside of the graph_lock: */ if (ret == 2) { printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); } return ret; } /* * Initialize a lock instance's lock-class mapping info: */ void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { if (unlikely(!debug_locks)) return; if (DEBUG_LOCKS_WARN_ON(!key)) return; if (DEBUG_LOCKS_WARN_ON(!name)) return; /* * Sanity check, the lock-class key must be persistent: */ if (!static_obj(key)) { printk("BUG: key %p not in .data!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } lock->name = name; lock->key = key; lock->class_cache = NULL; #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif if (subclass) register_lock_class(lock, subclass, 1); } EXPORT_SYMBOL_GPL(lockdep_init_map); /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; unsigned int depth, id; int chain_head = 0; u64 chain_key; if (!prove_locking) check = 1; if (unlikely(!debug_locks)) return 0; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); printk("turning off the locking correctness validator.\n"); return 0; } if (!subclass) class = lock->class_cache; /* * Not cached yet or subclass? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); if (!class) return 0; } debug_atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) printk("#%d", class->name_version); printk("\n"); dump_stack(); } /* * Add the lock to the list of currently held locks. * (we dont increase the depth just yet, up until the * dependency checks are done) */ depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; hlock = curr->held_locks + depth; if (DEBUG_LOCKS_WARN_ON(!class)) return 0; hlock->class_idx = class - lock_classes + 1; hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; hlock->trylock = trylock; hlock->read = read; hlock->check = check; hlock->hardirqs_off = !!hardirqs_off; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ if (!mark_lock(curr, hlock, LOCK_USED)) return 0; /* * Calculate the chain hash: it's the combined hash of all the * lock keys along the dependency chain. We save the hash value * at every step so that we can get the current hash easily * after unlock. The chain hash is then used to cache dependency * results. * * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ id = class - lock_classes; if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; if (!depth) { if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { chain_key = 0; chain_head = 1; } chain_key = iterate_chain_key(chain_key, id); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); #ifdef CONFIG_DEBUG_LOCKDEP if (unlikely(!debug_locks)) return 0; #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); printk("BUG: MAX_LOCK_DEPTH too low!\n"); printk("turning off the locking correctness validator.\n"); return 0; } if (unlikely(curr->lockdep_depth > max_lockdep_depth)) max_lockdep_depth = curr->lockdep_depth; return 1; } static int print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) return 0; if (debug_locks_silent) return 0; printk("\n=====================================\n"); printk( "[ BUG: bad unlock balance detected! ]\n"); printk( "-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); printk(") at:\n"); print_ip_sym(ip); printk("but there are no more locks to release!\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); return 0; } /* * Common debugging checks for both nested and non-nested unlock: */ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (unlikely(!debug_locks)) return 0; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; if (curr->lockdep_depth <= 0) return print_unlock_inbalance_bug(curr, lock, ip); return 1; } static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; struct lock_class *class; unsigned int depth; int i; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; /* * We must not cross into another context: */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; if (hlock->instance == lock) goto found_it; prev_hlock = hlock; } return print_unlock_inbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; for (; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip)) return 0; } if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; return 1; } /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. This is a * relatively rare operation, as all the unlock APIs default * to nested mode (which uses lock_release()): */ static int lock_release_non_nested(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { struct held_lock *hlock, *prev_hlock; unsigned int depth; int i; /* * Check whether the lock exists in the current stack * of held locks: */ depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; /* * We must not cross into another context: */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; if (hlock->instance == lock) goto found_it; prev_hlock = hlock; } return print_unlock_inbalance_bug(curr, lock, ip); found_it: lock_release_holdtime(hlock); /* * We have the right lock to unlock, 'hlock' points to it. * Now we remove it from the stack, and add back the other * entries (if any), recalculating the hash along the way: */ curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; for (i++; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip)) return 0; } if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; return 1; } /* * Remove the lock to the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed * mutex_lock_interruptible()). This is done for unlocks that nest * perfectly. (i.e. the current top of the lock-stack is unlocked) */ static int lock_release_nested(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { struct held_lock *hlock; unsigned int depth; /* * Pop off the top of the lock stack: */ depth = curr->lockdep_depth - 1; hlock = curr->held_locks + depth; /* * Is the unlock non-nested: */ if (hlock->instance != lock) return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) return 0; curr->curr_chain_key = hlock->prev_chain_key; lock_release_holdtime(hlock); #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class_idx = 0; hlock->acquire_ip = 0; hlock->irq_context = 0; #endif return 1; } /* * Remove the lock to the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed * mutex_lock_interruptible()). This is done for unlocks that nest * perfectly. (i.e. the current top of the lock-stack is unlocked) */ static void __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; if (!check_unlock(curr, lock, ip)) return; if (nested) { if (!lock_release_nested(curr, lock, ip)) return; } else { if (!lock_release_non_nested(curr, lock, ip)) return; } check_chain_key(curr); } /* * Check whether we follow the irq-flags state precisely: */ static void check_flags(unsigned long flags) { #if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) return; if (irqs_disabled_flags(flags)) { if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { printk("possible reason: unannotated irqs-off.\n"); } } else { if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { printk("possible reason: unannotated irqs-on.\n"); } } /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only * check if not in hardirq contexts: */ if (!hardirq_count()) { if (softirq_count()) DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); else DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } if (!debug_locks) print_irqtrace_events(current); #endif } void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { unsigned long flags; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); current->lockdep_recursion = 1; check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip) { unsigned long flags; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; } void lockdep_clear_current_reclaim_state(void) { current->lockdep_reclaim_gfp = 0; } #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) return 0; if (debug_locks_silent) return 0; printk("\n=================================\n"); printk( "[ BUG: bad contention detected! ]\n"); printk( "---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); printk(") at:\n"); print_ip_sym(ip); printk("but there are no locks held!\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); return 0; } static void __lock_contended(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; int i, contention_point, contending_point; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) return; prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; /* * We must not cross into another context: */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; if (hlock->instance == lock) goto found_it; prev_hlock = hlock; } print_lock_contention_bug(curr, lock, ip); return; found_it: hlock->waittime_stamp = sched_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, lock->ip); stats = get_lock_stats(hlock_class(hlock)); if (contention_point < LOCKSTAT_POINTS) stats->contention_point[contention_point]++; if (contending_point < LOCKSTAT_POINTS) stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; u64 now; s64 waittime = 0; int i, cpu; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) return; prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; /* * We must not cross into another context: */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; if (hlock->instance == lock) goto found_it; prev_hlock = hlock; } print_lock_contention_bug(curr, lock, _RET_IP_); return; found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { now = sched_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) lock_time_inc(&stats->read_waittime, waittime); else lock_time_inc(&stats->write_waittime, waittime); } if (lock->cpu != cpu) stats->bounces[bounce_acquired + !!hlock->read]++; put_lock_stats(stats); lock->cpu = cpu; lock->ip = ip; } void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; if (unlikely(!lock_stat)) return; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; __lock_contended(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; if (unlikely(!lock_stat)) return; if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; __lock_acquired(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); #endif /* * Used by the testsuite, sanitize the validator state * after a simulated failure: */ void lockdep_reset(void) { unsigned long flags; int i; raw_local_irq_save(flags); current->curr_chain_key = 0; current->lockdep_depth = 0; current->lockdep_recursion = 0; memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) INIT_LIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } static void zap_class(struct lock_class *class) { int i; /* * Remove all dependencies this lock is * involved in: */ for (i = 0; i < nr_list_entries; i++) { if (list_entries[i].class == class) list_del_rcu(&list_entries[i].entry); } /* * Unhash the class and remove it from the all_lock_classes list: */ list_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); class->key = NULL; } static inline int within(const void *addr, void *start, unsigned long size) { return addr >= start && addr < start + size; } void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class, *next; struct list_head *head; unsigned long flags; int i; int locked; raw_local_irq_save(flags); locked = graph_lock(); /* * Unhash all classes that were created by this module: */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) zap_class(class); } } if (locked) graph_unlock(); raw_local_irq_restore(flags); } void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class, *next; struct list_head *head; unsigned long flags; int i, j; int locked; raw_local_irq_save(flags); /* * Remove all classes this lock might have: */ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { /* * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); if (class) zap_class(class); } /* * Debug check: in the end all mapped classes should * be gone. */ locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { if (unlikely(class == lock->class_cache)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; } } } if (locked) graph_unlock(); out_restore: raw_local_irq_restore(flags); } void lockdep_init(void) { int i; /* * Some architectures have their own start_kernel() * code which calls lockdep_init(), while we also * call lockdep_init() from the start_kernel() itself, * and we want to initialize the hashes only once: */ if (lockdep_initialized) return; for (i = 0; i < CLASSHASH_SIZE; i++) INIT_LIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) INIT_LIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); printk(" memory used by lock dependency info: %lu kB\n", (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } #endif } static void print_freed_lock_bug(struct task_struct *curr, const void *mem_from, const void *mem_to, struct held_lock *hlock) { if (!debug_locks_off()) return; if (debug_locks_silent) return; printk("\n=========================\n"); printk( "[ BUG: held lock freed! ]\n"); printk( "-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); } static inline int not_in_range(const void* mem_from, unsigned long mem_len, const void* lock_from, unsigned long lock_len) { return lock_from + lock_len <= mem_from || mem_from + mem_len <= lock_from; } /* * Called when kernel memory is freed (or unmapped), or if a lock * is destroyed or reinitialized - this code checks whether there is * any held lock in the memory range of <from> to <to>: */ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) { struct task_struct *curr = current; struct held_lock *hlock; unsigned long flags; int i; if (unlikely(!debug_locks)) return; local_irq_save(flags); for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; if (not_in_range(mem_from, mem_len, hlock->instance, sizeof(*hlock->instance))) continue; print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); break; } local_irq_restore(flags); } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; if (debug_locks_silent) return; printk("\n=====================================\n"); printk( "[ BUG: lock held at task exit time! ]\n"); printk( "-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); } void debug_check_no_locks_held(struct task_struct *task) { if (unlikely(task->lockdep_depth > 0)) print_held_locks_bug(task); } void debug_show_all_locks(void) { struct task_struct *g, *p; int count = 10; int unlock = 1; if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); return; } printk("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, * if not successful after 2 seconds we ignore it (but keep * trying). This is to enable a debug printout even if a * tasklist_lock-holding task deadlocks or crashes. */ retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) printk("hm, tasklist_lock locked, retrying... "); if (count) { count--; printk(" #%d", 10-count); mdelay(200); goto retry; } printk(" ignoring it.\n"); unlock = 0; } else { if (count != 10) printk(KERN_CONT " locked it.\n"); } do_each_thread(g, p) { /* * It's not reliable to print a task's held locks * if it's not sleeping (or if it's not the current * task): */ if (p->state == TASK_RUNNING && p != current) continue; if (p->lockdep_depth) lockdep_print_held_locks(p); if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; } while_each_thread(g, p); printk("\n"); printk("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); /* * Careful: only use this function if you are sure that * the task cannot run in parallel! */ void __debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); return; } lockdep_print_held_locks(task); } EXPORT_SYMBOL_GPL(__debug_show_held_locks); void debug_show_held_locks(struct task_struct *task) { __debug_show_held_locks(task); } EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) { struct task_struct *curr = current; if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; printk("\n================================================\n"); printk( "[ BUG: lock held when returning to user space! ]\n"); printk( "------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } }