Index: 2.6-8xx/net/core/pktgen.c =================================================================== --- 2.6-8xx.orig/net/core/pktgen.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/core/pktgen.c 2005-06-16 13:45:08.000000000 -0300 @@ -503,7 +503,7 @@ static int pg_clone_skb_d = 0; static int debug = 0; -static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(_thread_lock); static struct pktgen_thread *pktgen_threads = NULL; static char module_fname[128]; Index: 2.6-8xx/net/core/dst.c =================================================================== --- 2.6-8xx.orig/net/core/dst.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/core/dst.c 2005-06-16 13:45:08.000000000 -0300 @@ -39,8 +39,7 @@ static void dst_run_gc(unsigned long); static void ___dst_free(struct dst_entry * dst); -static struct timer_list dst_gc_timer = - TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0); +static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0); static void dst_run_gc(unsigned long dummy) { Index: 2.6-8xx/net/core/netpoll.c =================================================================== --- 2.6-8xx.orig/net/core/netpoll.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/core/netpoll.c 2005-06-16 13:45:08.000000000 -0300 @@ -152,7 +152,9 @@ return; /* Process pending work on NIC */ + WARN_ON_RT(irqs_disabled()); np->dev->poll_controller(np->dev); + WARN_ON_RT(irqs_disabled()); if (np->dev->poll) poll_napi(np); @@ -179,28 +181,31 @@ static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; - local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - if(skb->destructor) - dev_kfree_skb_any(skb); /* put this one back */ - else - __kfree_skb(skb); - } } 
+ /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if(skb->destructor) + dev_kfree_skb_any(skb); /* put this one back */ + else + __kfree_skb(skb); + } } static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) @@ -263,7 +268,7 @@ } spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + np->dev->xmit_lock_owner = _smp_processor_id(); /* * network drivers do not expect to be called if the queue is @@ -612,7 +617,7 @@ struct net_device *ndev = NULL; struct in_device *in_dev; - np->poll_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&np->poll_lock); np->poll_owner = -1; if (np->dev_name) Index: 2.6-8xx/net/core/dev.c =================================================================== --- 2.6-8xx.orig/net/core/dev.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/core/dev.c 2005-06-16 13:45:08.000000000 -0300 @@ -161,7 +161,7 @@ #ifdef OFFLINE_SAMPLE static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); +static DEFINE_TIMER(samp_timer, sample_queue, 0, 0); #endif /* @@ -1318,10 +1318,16 @@ Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + int cpu = _smp_processor_id(); /* ok because BHs are off */ + /* + * No need to check for recursion with threaded interrupts: + */ +#ifdef CONFIG_PREEMPT_RT + if (1) { +#else if (dev->xmit_lock_owner != cpu) { - +#endif HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { @@ -1546,6 +1552,11 @@ BUG_TRAP(!atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. 
+ */ + cond_resched_all(); } } @@ -1568,10 +1579,17 @@ qdisc_run(dev); spin_unlock(&dev->queue_lock); } else { - netif_schedule(dev); + /* + * Dont re-kick the queue here, it will cause + * excessive scheduling of ksoftirqd due + * to retry. When the queue is released + * it will be completed anyway. + */ +// netif_schedule(dev); } } } + } static __inline__ int deliver_skb(struct sk_buff *skb, @@ -1788,12 +1806,13 @@ static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; int budget = netdev_max_backlog; local_irq_disable(); + queue = &__get_cpu_var(softnet_data); while (!list_empty(&queue->poll_list)) { struct net_device *dev; @@ -1802,6 +1821,10 @@ goto softnet_break; local_irq_enable(); + if (unlikely(cond_resched_all())) { + local_irq_disable(); + continue; + } dev = list_entry(queue->poll_list.next, struct net_device, poll_list); @@ -1827,8 +1850,10 @@ return; softnet_break: + preempt_disable(); __get_cpu_var(netdev_rx_stat).time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); + preempt_enable(); goto out; } Index: 2.6-8xx/net/core/sock.c =================================================================== --- 2.6-8xx.orig/net/core/sock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/core/sock.c 2005-06-16 13:45:08.000000000 -0300 @@ -1119,7 +1119,7 @@ { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk,1,POLL_IN); read_unlock(&sk->sk_callback_lock); } Index: 2.6-8xx/net/atm/signaling.c =================================================================== --- 2.6-8xx.orig/net/atm/signaling.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/atm/signaling.c 2005-06-16 13:45:08.000000000 -0300 @@ -261,7 +261,7 @@ .ops = &sigd_dev_ops, .type = "sig", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + 
.lock = SPIN_LOCK_UNLOCKED(sigd_dev.lock) }; Index: 2.6-8xx/net/atm/mpc.c =================================================================== --- 2.6-8xx.orig/net/atm/mpc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/atm/mpc.c 2005-06-16 13:45:08.000000000 -0300 @@ -105,7 +105,7 @@ struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static struct timer_list mpc_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(mpc_timer, NULL, 0, 0); static struct mpoa_client *find_mpc_by_itfnum(int itf) @@ -749,7 +749,7 @@ .ops = &mpc_ops, .type = "mpc", .number = 42, - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(mpc_dev.lock) /* members not explicitly initialised will be 0 */ }; Index: 2.6-8xx/net/atm/lec.c =================================================================== --- 2.6-8xx.orig/net/atm/lec.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/atm/lec.c 2005-06-16 13:45:08.000000000 -0300 @@ -597,7 +597,7 @@ .ops = &lecdev_ops, .type = "lec", .number = 999, /* dummy device number */ - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(lecatm_dev.lock) }; /* Index: 2.6-8xx/net/atm/clip.c =================================================================== --- 2.6-8xx.orig/net/atm/clip.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/atm/clip.c 2005-06-16 13:45:08.000000000 -0300 @@ -707,7 +707,7 @@ .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; Index: 2.6-8xx/net/netrom/nr_loopback.c =================================================================== --- 2.6-8xx.orig/net/netrom/nr_loopback.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/netrom/nr_loopback.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,7 +17,7 @@ static void nr_loopback_timer(unsigned long); static struct sk_buff_head loopback_queue; -static struct timer_list loopback_timer = TIMER_INITIALIZER(nr_loopback_timer, 0, 0); +static 
DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0); void __init nr_loopback_init(void) { Index: 2.6-8xx/net/unix/af_unix.c =================================================================== --- 2.6-8xx.orig/net/unix/af_unix.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/unix/af_unix.c 2005-06-16 13:45:08.000000000 -0300 @@ -290,7 +290,7 @@ read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->sk_callback_lock); Index: 2.6-8xx/net/sched/sch_api.c =================================================================== --- 2.6-8xx.orig/net/sched/sch_api.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/sched/sch_api.c 2005-06-16 13:45:08.000000000 -0300 @@ -1205,7 +1205,7 @@ * with 32-bit get_cycles(). Safe up to 4GHz CPU. */ static void psched_tick(unsigned long); -static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0); +static DEFINE_TIMER(psched_timer, psched_tick, 0, 0); static void psched_tick(unsigned long dummy) { Index: 2.6-8xx/net/sched/sch_generic.c =================================================================== --- 2.6-8xx.orig/net/sched/sch_generic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/sched/sch_generic.c 2005-06-16 13:45:08.000000000 -0300 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,10 @@ * will be requeued. */ if (!nolock) { +#ifdef CONFIG_PREEMPT_RT + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = _smp_processor_id(); +#else if (!spin_trylock(&dev->xmit_lock)) { collision: /* So, someone grabbed the driver. */ @@ -117,17 +122,19 @@ it by checking xmit owner and drop the packet when deadloop is detected. 
*/ - if (dev->xmit_lock_owner == smp_processor_id()) { + if (dev->xmit_lock_owner == _smp_processor_id()) { kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + return -1; } __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } /* Remember that the driver is grabbed by us. */ - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = _smp_processor_id(); +#endif } { @@ -139,18 +146,34 @@ if (netdev_nit) dev_queue_xmit_nit(skb, dev); + WARN_ON_RT(irqs_disabled()); ret = dev->hard_start_xmit(skb, dev); +#ifdef CONFIG_PREEMPT_RT + if (irqs_disabled()) { + if (printk_ratelimit()) + print_symbol("network driver disabled interrupts: %s\n", (unsigned long)dev->hard_start_xmit); + local_irq_enable(); + } +#endif if (ret == NETDEV_TX_OK) { if (!nolock) { dev->xmit_lock_owner = -1; spin_unlock(&dev->xmit_lock); } spin_lock(&dev->queue_lock); +#if defined(CONFIG_PREEMPT_RT) && 0 + preempt_disable(); + __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); + goto requeue; +#else return -1; +#endif } if (ret == NETDEV_TX_LOCKED && nolock) { spin_lock(&dev->queue_lock); - goto collision; +// ugh, is this right. goto collision; + return -1; } } @@ -174,12 +197,16 @@ 3. 
device is buggy (ppp) */ +#ifndef CONFIG_PREEMPT_RT requeue: +#endif q->ops->requeue(skb, q); netif_schedule(dev); + return 1; } BUG_ON((int) q->q.qlen < 0); + return q->q.qlen; } Index: 2.6-8xx/net/decnet/dn_route.c =================================================================== --- 2.6-8xx.orig/net/decnet/dn_route.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/decnet/dn_route.c 2005-06-16 13:45:08.000000000 -0300 @@ -117,8 +117,7 @@ static unsigned dn_rt_hash_mask; static struct timer_list dn_route_timer; -static struct timer_list dn_rt_flush_timer = - TIMER_INITIALIZER(dn_run_flush, 0, 0); +static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0); int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { Index: 2.6-8xx/net/ipv6/ip6_flowlabel.c =================================================================== --- 2.6-8xx.orig/net/ipv6/ip6_flowlabel.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv6/ip6_flowlabel.c 2005-06-16 13:45:08.000000000 -0300 @@ -50,7 +50,7 @@ static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); -static struct timer_list ip6_fl_gc_timer = TIMER_INITIALIZER(ip6_fl_gc, 0, 0); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); /* FL hash table lock: it protects only of GC */ Index: 2.6-8xx/net/ipv6/ip6_fib.c =================================================================== --- 2.6-8xx.orig/net/ipv6/ip6_fib.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv6/ip6_fib.c 2005-06-16 13:45:08.000000000 -0300 @@ -92,7 +92,7 @@ static __u32 rt_sernum; -static struct timer_list ip6_fib_timer = TIMER_INITIALIZER(fib6_run_gc, 0, 0); +static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0); struct fib6_walker_t fib6_walker_list = { .prev = &fib6_walker_list, Index: 2.6-8xx/net/ipv6/addrconf.c =================================================================== --- 2.6-8xx.orig/net/ipv6/addrconf.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv6/addrconf.c 
2005-06-16 13:45:08.000000000 -0300 @@ -122,8 +122,7 @@ static void addrconf_verify(unsigned long); -static struct timer_list addr_chk_timer = - TIMER_INITIALIZER(addrconf_verify, 0, 0); +static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); static DEFINE_SPINLOCK(addrconf_verify_lock); static void addrconf_join_anycast(struct inet6_ifaddr *ifp); Index: 2.6-8xx/net/sunrpc/sched.c =================================================================== --- 2.6-8xx.orig/net/sunrpc/sched.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/sunrpc/sched.c 2005-06-16 13:45:08.000000000 -0300 @@ -135,8 +135,6 @@ static void rpc_delete_timer(struct rpc_task *task) { - if (RPC_IS_QUEUED(task)) - return; if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { del_singleshot_timer_sync(&task->tk_timer); dprintk("RPC: %4d deleting timer\n", task->tk_pid); @@ -337,6 +335,8 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, rpc_action timer) { + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); /* * Protect the queue operations. */ @@ -566,7 +566,6 @@ BUG_ON(RPC_IS_QUEUED(task)); - restarted: while (1) { /* * Garbage collection of pending timers... @@ -594,6 +593,8 @@ unlock_kernel(); } + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); /* * Perform the next FSM step. * tk_action may be NULL when the task has been killed @@ -607,6 +608,7 @@ unlock_kernel(); } + restarted: /* * Lockless check for whether task is sleeping or not. */ @@ -925,6 +927,8 @@ void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) { + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); spin_lock_bh(&childq.lock); /* N.B. Is it possible for the child to have already finished? 
*/ __rpc_sleep_on(&childq, task, func, NULL); Index: 2.6-8xx/net/sunrpc/clnt.c =================================================================== --- 2.6-8xx.orig/net/sunrpc/clnt.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/sunrpc/clnt.c 2005-06-16 13:45:08.000000000 -0300 @@ -232,7 +232,12 @@ clnt->cl_oneshot = 0; clnt->cl_dead = 0; rpc_killall_tasks(clnt); - sleep_on_timeout(&destroy_wait, 1*HZ); + /* + * Sleep until no users remain or the timeout + * expires: + */ + wait_event_timeout(destroy_wait, + !atomic_read(&clnt->cl_users), 1*HZ); } if (atomic_read(&clnt->cl_users) < 0) { Index: 2.6-8xx/net/ipv4/netfilter/ip_tables.c =================================================================== --- 2.6-8xx.orig/net/ipv4/netfilter/ip_tables.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv4/netfilter/ip_tables.c 2005-06-16 13:45:08.000000000 -0300 @@ -111,7 +111,11 @@ static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) +/* + * Use atomic add because on PREEMPT_RT the same table might + * be used on two CPUs at once: + */ +#define ADD_COUNTER(c,b,p) do { atomic_add((b), (atomic_t *)(&(c).bcnt)); atomic_add((p), (atomic_t *)(&(c).pcnt)); } while(0) #ifdef CONFIG_SMP #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) @@ -290,8 +294,17 @@ read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + /* + * on a PREEMPT_RT kernel the task could schedule + * off and smp_processor_id() is not safe. So we take + * the current value of the CPU and use that table. We + * only update the counters while read-locking the table + * and dont change the rules so the possibility of the + * same table being used by two tasks at once is not a + * problem. 
+ */ table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + + TABLE_OFFSET(table->private, _smp_processor_id()); e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -299,7 +312,7 @@ if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", - smp_processor_id(), + _smp_processor_id(), table->name, &((struct ipt_entry *)table_base)->comefrom, ((struct ipt_entry *)table_base)->comefrom); Index: 2.6-8xx/net/ipv4/inetpeer.c =================================================================== --- 2.6-8xx.orig/net/ipv4/inetpeer.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv4/inetpeer.c 2005-06-16 13:45:08.000000000 -0300 @@ -99,8 +99,7 @@ #define PEER_MAX_CLEANUP_WORK 30 static void peer_check_expire(unsigned long dummy); -static struct timer_list peer_periodic_timer = - TIMER_INITIALIZER(peer_check_expire, 0, 0); +static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); /* Exported for sysctl_net_ipv4. 
*/ int inet_peer_gc_mintime = 10 * HZ, Index: 2.6-8xx/net/ipv4/tcp_minisocks.c =================================================================== --- 2.6-8xx.orig/net/ipv4/tcp_minisocks.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv4/tcp_minisocks.c 2005-06-16 13:45:08.000000000 -0300 @@ -420,7 +420,7 @@ static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; static DEFINE_SPINLOCK(tw_death_lock); -static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); +static DEFINE_TIMER(tcp_tw_timer, tcp_twkill, 0, 0); static void twkill_work(void *); static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); static u32 twkill_thread_slots; @@ -549,8 +549,7 @@ static int tcp_twcal_hand = -1; static int tcp_twcal_jiffie; static void tcp_twcal_tick(unsigned long); -static struct timer_list tcp_twcal_timer = - TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); +static DEFINE_TIMER(tcp_twcal_timer, tcp_twcal_tick, 0, 0); static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) Index: 2.6-8xx/net/ipv4/tcp_ipv4.c =================================================================== --- 2.6-8xx.orig/net/ipv4/tcp_ipv4.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/net/ipv4/tcp_ipv4.c 2005-06-16 13:45:08.000000000 -0300 @@ -93,7 +93,7 @@ .__tcp_lhash_users = ATOMIC_INIT(0), .__tcp_lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), - .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED + .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED(tcp_hashinfo.__tcp_portalloc_lock) }; /* Index: 2.6-8xx/sound/core/pcm_lib.c =================================================================== --- 2.6-8xx.orig/sound/core/pcm_lib.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/core/pcm_lib.c 2005-06-16 13:45:08.000000000 -0300 @@ -133,6 +133,7 @@ snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); #ifdef CONFIG_SND_DEBUG if (substream->pstr->xrun_debug) { + user_trace_stop(); snd_printd(KERN_DEBUG 
"XRUN: pcmC%dD%d%c\n", substream->pcm->card->number, substream->pcm->device, Index: 2.6-8xx/sound/oss/sys_timer.c =================================================================== --- 2.6-8xx.orig/sound/oss/sys_timer.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/oss/sys_timer.c 2005-06-16 13:45:08.000000000 -0300 @@ -28,8 +28,7 @@ static void poll_def_tmr(unsigned long dummy); static DEFINE_SPINLOCK(lock); - -static struct timer_list def_tmr = TIMER_INITIALIZER(poll_def_tmr, 0, 0); +static DEFINE_TIMER(def_tmr, poll_def_tmr, 0, 0); static unsigned long tmr2ticks(int tmr_value) Index: 2.6-8xx/sound/oss/midibuf.c =================================================================== --- 2.6-8xx.orig/sound/oss/midibuf.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/oss/midibuf.c 2005-06-16 13:45:08.000000000 -0300 @@ -50,7 +50,7 @@ static void midi_poll(unsigned long dummy); -static struct timer_list poll_timer = TIMER_INITIALIZER(midi_poll, 0, 0); +static DEFINE_TIMER(poll_timer, midi_poll, 0, 0); static volatile int open_devs; static DEFINE_SPINLOCK(lock); Index: 2.6-8xx/sound/oss/uart6850.c =================================================================== --- 2.6-8xx.orig/sound/oss/uart6850.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/oss/uart6850.c 2005-06-16 13:45:08.000000000 -0300 @@ -78,8 +78,7 @@ static void poll_uart6850(unsigned long dummy); -static struct timer_list uart6850_timer = - TIMER_INITIALIZER(poll_uart6850, 0, 0); +static DEFINE_TIMER(uart6850_timer, poll_uart6850, 0, 0); static void uart6850_input_loop(void) { Index: 2.6-8xx/sound/oss/soundcard.c =================================================================== --- 2.6-8xx.orig/sound/oss/soundcard.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/oss/soundcard.c 2005-06-16 13:45:08.000000000 -0300 @@ -682,8 +682,7 @@ } -static struct timer_list seq_timer = - TIMER_INITIALIZER(do_sequencer_timer, 0, 0); +static DEFINE_TIMER(seq_timer, do_sequencer_timer, 0, 
0); void request_sound_timer(int count) { Index: 2.6-8xx/sound/oss/dmasound/dmasound_core.c =================================================================== --- 2.6-8xx.orig/sound/oss/dmasound/dmasound_core.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/sound/oss/dmasound/dmasound_core.c 2005-06-16 13:45:08.000000000 -0300 @@ -230,7 +230,7 @@ * Mid level stuff */ -struct sound_settings dmasound = { .lock = SPIN_LOCK_UNLOCKED }; +struct sound_settings dmasound = { .lock = SPIN_LOCK_UNLOCKED(dmasound.lock) }; static inline void sound_silence(void) { Index: 2.6-8xx/fs/xfs/linux-2.6/mutex.h =================================================================== --- 2.6-8xx.orig/fs/xfs/linux-2.6/mutex.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/linux-2.6/mutex.h 2005-06-16 13:45:08.000000000 -0300 @@ -42,7 +42,7 @@ * callers. */ #define MUTEX_DEFAULT 0x0 -typedef struct semaphore mutex_t; +typedef struct compat_semaphore mutex_t; #define mutex_init(lock, type, name) sema_init(lock, 1) #define mutex_destroy(lock) sema_init(lock, -99) Index: 2.6-8xx/fs/xfs/linux-2.6/xfs_buf.c =================================================================== --- 2.6-8xx.orig/fs/xfs/linux-2.6/xfs_buf.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/linux-2.6/xfs_buf.c 2005-06-16 13:45:08.000000000 -0300 @@ -976,7 +976,7 @@ pagebuf_lock_value( xfs_buf_t *pb) { - return(atomic_read(&pb->pb_sema.count)); + return !sem_is_locked(&pb->pb_sema); } #endif Index: 2.6-8xx/fs/xfs/linux-2.6/xfs_buf.h =================================================================== --- 2.6-8xx.orig/fs/xfs/linux-2.6/xfs_buf.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/linux-2.6/xfs_buf.h 2005-06-16 13:45:08.000000000 -0300 @@ -138,7 +138,7 @@ #define PB_PAGES 2 typedef struct xfs_buf { - struct semaphore pb_sema; /* semaphore for lockables */ + struct compat_semaphore pb_sema; /* semaphore for lockables */ unsigned long pb_queuetime; /* time buffer was queued */ atomic_t 
pb_pin_count; /* pin count */ wait_queue_head_t pb_waiters; /* unpin waiters */ @@ -158,7 +158,7 @@ page_buf_iodone_t pb_iodone; /* I/O completion function */ page_buf_relse_t pb_relse; /* releasing function */ page_buf_bdstrat_t pb_strat; /* pre-write function */ - struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + struct compat_semaphore pb_iodonesema; /* Semaphore for I/O waiters */ void *pb_fspriv; void *pb_fspriv2; void *pb_fspriv3; Index: 2.6-8xx/fs/xfs/linux-2.6/mrlock.h =================================================================== --- 2.6-8xx.orig/fs/xfs/linux-2.6/mrlock.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/linux-2.6/mrlock.h 2005-06-16 13:45:08.000000000 -0300 @@ -37,12 +37,12 @@ enum { MR_NONE, MR_ACCESS, MR_UPDATE }; typedef struct { - struct rw_semaphore mr_lock; - int mr_writer; + struct compat_rw_semaphore mr_lock; + int mr_writer; } mrlock_t; #define mrinit(mrp, name) \ - ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) ) + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) #define mraccess(mrp) mraccessf(mrp, 0) Index: 2.6-8xx/fs/xfs/linux-2.6/sema.h =================================================================== --- 2.6-8xx.orig/fs/xfs/linux-2.6/sema.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/linux-2.6/sema.h 2005-06-16 13:45:08.000000000 -0300 @@ -41,7 +41,7 @@ * sema_t structure just maps to struct semaphore in Linux kernel. 
*/ -typedef struct semaphore sema_t; +typedef struct compat_semaphore sema_t; #define init_sema(sp, val, c, d) sema_init(sp, val) #define initsema(sp, val) sema_init(sp, val) Index: 2.6-8xx/fs/xfs/support/ktrace.c =================================================================== --- 2.6-8xx.orig/fs/xfs/support/ktrace.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/support/ktrace.c 2005-06-16 13:45:08.000000000 -0300 @@ -170,7 +170,7 @@ void *val14, void *val15) { - static lock_t wrap_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(wrap_lock); unsigned long flags; int index; ktrace_entry_t *ktep; Index: 2.6-8xx/fs/xfs/xfs_mount.h =================================================================== --- 2.6-8xx.orig/fs/xfs/xfs_mount.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/xfs_mount.h 2005-06-16 13:45:08.000000000 -0300 @@ -340,7 +340,7 @@ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct compat_rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ sema_t m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ Index: 2.6-8xx/fs/xfs/quota/xfs_qm.h =================================================================== --- 2.6-8xx.orig/fs/xfs/quota/xfs_qm.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/quota/xfs_qm.h 2005-06-16 13:45:08.000000000 -0300 @@ -177,8 +177,8 @@ #define XFS_QM_BWARNLIMIT 5 #define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) -#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) +#define XFS_QM_LOCK(xqm) mutex_lock(&xqm##_lock, PINOD) +#define XFS_QM_UNLOCK(xqm) mutex_unlock(&xqm##_lock) #define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) Index: 2.6-8xx/fs/xfs/quota/xfs_quota_priv.h 
=================================================================== --- 2.6-8xx.orig/fs/xfs/quota/xfs_quota_priv.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/xfs/quota/xfs_quota_priv.h 2005-06-16 13:45:08.000000000 -0300 @@ -64,8 +64,8 @@ #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) -#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) +#define XQMLCK(h) mutex_lock(&((h)->qh_lock), PINOD) +#define XQMUNLCK(h) mutex_unlock(&((h)->qh_lock)) #ifdef DEBUG struct xfs_dqhash; static inline int XQMISLCKD(struct xfs_dqhash *h) Index: 2.6-8xx/fs/proc/array.c =================================================================== --- 2.6-8xx.orig/fs/proc/array.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/proc/array.c 2005-06-16 13:45:08.000000000 -0300 @@ -129,17 +129,19 @@ */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char * get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | Index: 2.6-8xx/fs/proc/proc_misc.c =================================================================== --- 2.6-8xx.orig/fs/proc/proc_misc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/proc/proc_misc.c 2005-06-16 13:45:08.000000000 -0300 @@ -401,6 +401,41 @@ nr_running(), nr_iowait()); +#ifdef CONFIG_PREEMPT_RT + { + unsigned long nr_uninterruptible_cpu(int cpu); + extern int pi_walk, pi_null, pi_prio; + extern int rt_overload_schedule, + 
rt_overload_wakeup, rt_overload_pulled; + unsigned long rt_nr_running_cpu(int cpu); + extern atomic_t rt_overload; + + int i; + + seq_printf(p, "rt_overload_schedule: %d\n", + rt_overload_schedule); + seq_printf(p, "rt_overload_wakeup: %d\n", + rt_overload_wakeup); + seq_printf(p, "rt_overload_pulled: %d\n", + rt_overload_pulled); + seq_printf(p, "pi_null: %d\n", pi_null); + seq_printf(p, "pi_prio: %d\n", pi_prio); + seq_printf(p, "pi_walk: %d\n", pi_walk); + seq_printf(p, "nr_running(): %ld\n", + nr_running()); + seq_printf(p, "nr_uninterruptible(): %ld\n", + nr_uninterruptible()); + for_each_cpu(i) + seq_printf(p, "nr_uninterruptible(%d): %ld\n", + i, nr_uninterruptible_cpu(i)); + for_each_cpu(i) + seq_printf(p, "rt_nr_running(%d): %ld\n", + i, rt_nr_running_cpu(i)); + seq_printf(p, "rt_overload: %d\n", atomic_read(&rt_overload)); + + } +#endif + return 0; } @@ -517,6 +552,20 @@ return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_LATENCY_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -596,6 +645,9 @@ #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_LATENCY_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: 2.6-8xx/fs/proc/task_mmu.c =================================================================== --- 2.6-8xx.orig/fs/proc/task_mmu.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/proc/task_mmu.c 2005-06-16 13:45:08.000000000 -0300 @@ -184,8 
+184,10 @@ map = NULL; if ((unsigned long)l < mm->map_count) { map = mm->mmap; - while (l-- && map) + while (l-- && map) { map = map->vm_next; + cond_resched(); + } goto out; } Index: 2.6-8xx/fs/nfsd/nfssvc.c =================================================================== --- 2.6-8xx.orig/fs/nfsd/nfssvc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/nfsd/nfssvc.c 2005-06-16 13:45:08.000000000 -0300 @@ -285,6 +285,7 @@ /* Release the thread */ svc_exit_thread(rqstp); + unlock_kernel(); /* Release module */ module_put_and_exit(0); } Index: 2.6-8xx/fs/jbd/journal.c =================================================================== --- 2.6-8xx.orig/fs/jbd/journal.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/jbd/journal.c 2005-06-16 13:45:08.000000000 -0300 @@ -82,6 +82,14 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) +DEFINE_SPINLOCK(journal_bh_state_lock); +DEFINE_SPINLOCK(journal_bh_journal_lock); + +EXPORT_SYMBOL(journal_bh_state_lock); +EXPORT_SYMBOL(journal_bh_journal_lock); +#endif + /* * Helper function used to manage commit timeouts */ Index: 2.6-8xx/fs/pipe.c =================================================================== --- 2.6-8xx.orig/fs/pipe.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/pipe.c 2005-06-16 13:45:08.000000000 -0300 @@ -202,8 +202,14 @@ wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -342,8 +348,14 @@ wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? 
+ */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) inode_update_time(inode, 1); /* mtime and ctime */ +#endif return ret; } Index: 2.6-8xx/fs/nfs/inode.c =================================================================== --- 2.6-8xx.orig/fs/nfs/inode.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/nfs/inode.c 2005-06-16 13:45:08.000000000 -0300 @@ -118,7 +118,7 @@ int flags = sync ? FLUSH_WAIT : 0; int ret; - ret = nfs_commit_inode(inode, 0, 0, flags); + ret = nfs_commit_inode(inode, flags); if (ret < 0) return ret; return 0; Index: 2.6-8xx/fs/nfs/write.c =================================================================== --- 2.6-8xx.orig/fs/nfs/write.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/nfs/write.c 2005-06-16 13:45:08.000000000 -0300 @@ -352,7 +352,7 @@ if (err < 0) goto out; } - err = nfs_commit_inode(inode, 0, 0, wb_priority(wbc)); + err = nfs_commit_inode(inode, wb_priority(wbc)); if (err > 0) { wbc->nr_to_write -= err; err = 0; @@ -446,6 +446,8 @@ struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&nfsi->req_lock); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); nfs_list_add_request(req, &nfsi->dirty); nfsi->ndirty++; spin_unlock(&nfsi->req_lock); @@ -503,13 +505,12 @@ spin_lock(&nfsi->req_lock); next = idx_start; - while (radix_tree_gang_lookup(&nfsi->nfs_page_tree, (void **)&req, next, 1)) { + while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) { if (req->wb_index > idx_end) break; next = req->wb_index + 1; - if (!NFS_WBACK_BUSY(req)) - continue; + BUG_ON(!NFS_WBACK_BUSY(req)); atomic_inc(&req->wb_count); spin_unlock(&nfsi->req_lock); @@ -538,12 +539,15 @@ nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->dirty, dst, idx_start, npages); - nfsi->ndirty -= res; - sub_page_state(nr_dirty,res); - if ((nfsi->ndirty == 0) != 
list_empty(&nfsi->dirty)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + int res = 0; + + if (nfsi->ndirty != 0) { + res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages); + nfsi->ndirty -= res; + sub_page_state(nr_dirty,res); + if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + } return res; } @@ -562,11 +566,14 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); - nfsi->ncommit -= res; - if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + int res = 0; + + if (nfsi->ncommit != 0) { + res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages); + nfsi->ncommit -= res; + if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + } return res; } #endif @@ -821,7 +828,7 @@ #else nfs_inode_remove_request(req); #endif - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } static inline int flush_task_priority(int how) @@ -952,7 +959,7 @@ nfs_writedata_free(data); } nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); return -ENOMEM; } @@ -1002,7 +1009,7 @@ struct nfs_page *req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1029,7 +1036,7 @@ req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return error; } @@ -1121,7 +1128,7 @@ nfs_inode_remove_request(req); #endif next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } } @@ -1210,36 +1217,24 @@ struct nfs_write_data *data, int how) { 
struct rpc_task *task = &data->task; - struct nfs_page *first, *last; + struct nfs_page *first; struct inode *inode; - loff_t start, end, len; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ list_splice_init(head, &data->pages); first = nfs_list_entry(data->pages.next); - last = nfs_list_entry(data->pages.prev); inode = first->wb_context->dentry->d_inode; - /* - * Determine the offset range of requests in the COMMIT call. - * We rely on the fact that data->pages is an ordered list... - */ - start = req_offset(first); - end = req_offset(last) + last->wb_bytes; - len = end - start; - /* If 'len' is not a 32-bit quantity, pass '0' in the COMMIT call */ - if (end >= i_size_read(inode) || len < 0 || len > (~((u32)0) >> 1)) - len = 0; - data->inode = inode; data->cred = first->wb_context->cred; data->args.fh = NFS_FH(data->inode); - data->args.offset = start; - data->args.count = len; - data->res.count = len; + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -1278,7 +1273,7 @@ req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_commit(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1324,7 +1319,7 @@ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); res++; } sub_page_state(nr_unstable,res); @@ -1350,8 +1345,7 @@ } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -int nfs_commit_inode(struct inode *inode, unsigned long idx_start, - unsigned int npages, int how) +int nfs_commit_inode(struct inode *inode, int how) { struct nfs_inode *nfsi = NFS_I(inode); LIST_HEAD(head); @@ -1359,15 +1353,13 @@ error = 0; spin_lock(&nfsi->req_lock); - res = nfs_scan_commit(inode, &head, idx_start, npages); + res = nfs_scan_commit(inode, &head, 0, 
0); + spin_unlock(&nfsi->req_lock); if (res) { - res += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfsi->req_lock); error = nfs_commit_list(&head, how); - } else - spin_unlock(&nfsi->req_lock); - if (error < 0) - return error; + if (error < 0) + return error; + } return res; } #endif @@ -1389,7 +1381,7 @@ error = nfs_flush_inode(inode, idx_start, npages, how); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (error == 0) - error = nfs_commit_inode(inode, idx_start, npages, how); + error = nfs_commit_inode(inode, how); #endif } while (error > 0); return error; Index: 2.6-8xx/fs/nfs/read.c =================================================================== --- 2.6-8xx.orig/fs/nfs/read.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/nfs/read.c 2005-06-16 13:45:08.000000000 -0300 @@ -173,7 +173,6 @@ if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, &one_request); nfs_pagein_one(&one_request, inode); return 0; @@ -185,7 +184,6 @@ nfs_clear_request(req); nfs_release_request(req); - nfs_unlock_request(req); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -553,7 +551,6 @@ } if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, desc->head); return 0; } Index: 2.6-8xx/fs/nfs/pagelist.c =================================================================== --- 2.6-8xx.orig/fs/nfs/pagelist.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/nfs/pagelist.c 2005-06-16 13:45:08.000000000 -0300 @@ -112,6 +112,33 @@ } /** + * nfs_set_page_writeback_locked - Lock a request for writeback + * @req: + */ +int nfs_set_page_writeback_locked(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + if (!nfs_lock_request(req)) + return 0; + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + 
return 1; +} + +/** + * nfs_clear_page_writeback - Unlock request and wake up sleepers + */ +void nfs_clear_page_writeback(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + spin_lock(&nfsi->req_lock); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + spin_unlock(&nfsi->req_lock); + nfs_unlock_request(req); +} + +/** * nfs_clear_request - Free up all resources allocated to the request * @req: * @@ -151,36 +178,6 @@ } /** - * nfs_list_add_request - Insert a request into a sorted list - * @req: request - * @head: head of list into which to insert the request. - * - * Note that the wb_list is sorted by page index in order to facilitate - * coalescing of requests. - * We use an insertion sort that is optimized for the case of appended - * writes. - */ -void -nfs_list_add_request(struct nfs_page *req, struct list_head *head) -{ - struct list_head *pos; - -#ifdef NFS_PARANOIA - if (!list_empty(&req->wb_list)) { - printk(KERN_ERR "NFS: Add to list failed!\n"); - BUG(); - } -#endif - list_for_each_prev(pos, head) { - struct nfs_page *p = nfs_list_entry(pos); - if (p->wb_index < req->wb_index) - break; - } - list_add(&req->wb_list, pos); - req->wb_list_head = head; -} - -/** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. * @@ -243,6 +240,63 @@ return npages; } +#define NFS_SCAN_MAXENTRIES 16 +/** + * nfs_scan_lock_dirty - Scan the radix tree for dirty requests + * @nfsi: NFS inode + * @dst: Destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves elements from one of the inode request lists. + * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. 
+ * You must be holding the inode's req_lock when calling this function + */ +int +nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst, + unsigned long idx_start, unsigned int npages) +{ + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + unsigned long idx_end; + int found, i; + int res; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + for (;;) { + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, NFS_SCAN_MAXENTRIES, + NFS_PAGE_TAG_DIRTY); + if (found <= 0) + break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + + idx_start = req->wb_index + 1; + + if (nfs_set_page_writeback_locked(req)) { + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + } + cond_resched_lock(&nfsi->req_lock); + } +out: + return res; +} + /** * nfs_scan_list - Scan a list for matching requests * @head: One of the NFS inode request lists @@ -257,10 +311,12 @@ * You must be holding the inode's req_lock when calling this function */ int -nfs_scan_list(struct list_head *head, struct list_head *dst, - unsigned long idx_start, unsigned int npages) +nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, + struct list_head *dst, unsigned long idx_start, + unsigned int npages) { - struct list_head *pos, *tmp; + LIST_HEAD(locked); + struct list_head *pos; struct nfs_page *req; unsigned long idx_end; int res; @@ -271,21 +327,22 @@ else idx_end = idx_start + npages - 1; - list_for_each_safe(pos, tmp, head) { + while (!list_empty(head)) { + pos = head->next; req = nfs_list_entry(pos); - if (req->wb_index < idx_start) - continue; - if (req->wb_index > idx_end) - break; - - if (!nfs_lock_request(req)) - continue; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - res++; + if 
(!nfs_set_page_writeback_locked(req)) { + list_del(pos); + list_add(&req->wb_list, &locked); + } else { + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + cond_resched_lock(&nfsi->req_lock); } + list_splice(&locked, head); return res; } Index: 2.6-8xx/fs/lockd/svc.c =================================================================== --- 2.6-8xx.orig/fs/lockd/svc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/lockd/svc.c 2005-06-16 13:45:08.000000000 -0300 @@ -49,7 +49,7 @@ int nlmsvc_grace_period; unsigned long nlmsvc_timeout; -static DECLARE_MUTEX_LOCKED(lockd_start); +static DECLARE_WAIT_QUEUE_HEAD(lockd_start); static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); /* @@ -112,7 +112,7 @@ * Let our maker know we're running. */ nlmsvc_pid = current->pid; - up(&lockd_start); + wake_up(&lockd_start); daemonize("lockd"); @@ -261,8 +261,15 @@ "lockd_up: create thread failed, error=%d\n", error); goto destroy_and_out; } - down(&lockd_start); - + /* + * Wait for the lockd process to start, but since we're holding + * the lockd semaphore, we can't wait around forever ... + */ + if (wait_event_interruptible_timeout(lockd_start, + nlmsvc_pid != 0, HZ) <= 0) { + printk(KERN_WARNING + "lockd_down: lockd failed to start\n"); + } /* * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. @@ -302,16 +309,12 @@ * Wait for the lockd process to exit, but since we're holding * the lockd semaphore, we can't wait around forever ... 
*/ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { + if (wait_event_interruptible_timeout(lockd_exit, + nlmsvc_pid == 0, HZ) <= 0) { printk(KERN_WARNING "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(&current->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); out: up(&nlmsvc_sema); } Index: 2.6-8xx/fs/fcntl.c =================================================================== --- 2.6-8xx.orig/fs/fcntl.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/fcntl.c 2005-06-16 13:45:08.000000000 -0300 @@ -442,7 +442,8 @@ break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + // we hold the tasklist lock already: + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -476,7 +477,7 @@ struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) Index: 2.6-8xx/fs/ext3/balloc.c =================================================================== --- 2.6-8xx.orig/fs/ext3/balloc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/ext3/balloc.c 2005-06-16 13:45:08.000000000 -0300 @@ -749,24 +749,24 @@ * to find a free region that is of my size and has not * been reserved. * - * on succeed, it returns the reservation window to be appended to. - * failed, return NULL.
*/ -static struct ext3_reserve_window_node *find_next_reservable_window( +static int find_next_reservable_window( struct ext3_reserve_window_node *search_head, - unsigned long size, int *start_block, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, int start_block, int last_block) { struct rb_node *next; struct ext3_reserve_window_node *rsv, *prev; int cur; + int size = my_rsv->rsv_goal_size; /* TODO: make the start of the reservation window byte-aligned */ /* cur = *start_block & ~7;*/ - cur = *start_block; + cur = start_block; rsv = search_head; if (!rsv) - return NULL; + return -1; while (1) { if (cur <= rsv->rsv_end) @@ -782,7 +782,7 @@ * space with expected-size (or more)... */ if (cur > last_block) - return NULL; /* fail */ + return -1; /* fail */ prev = rsv; next = rb_next(&rsv->rsv_node); @@ -813,8 +813,26 @@ * return the reservation window that we could append to. * succeed. */ - *start_block = cur; - return prev; + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* let's book the whole available window for now + * we will check the + * disk bitmap later and then, if there are free block + * then we adjust the window size if it's + * larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window.
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; } /** @@ -852,6 +870,7 @@ * @sb: the super block * @group: the group we are trying to allocate in * @bitmap_bh: the block group block bitmap + * */ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, int goal, struct super_block *sb, @@ -860,10 +879,10 @@ struct ext3_reserve_window_node *search_head; int group_first_block, group_end_block, start_block; int first_free_block; - int reservable_space_start; - struct ext3_reserve_window_node *prev_rsv; struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + group * EXT3_BLOCKS_PER_GROUP(sb); @@ -875,6 +894,7 @@ start_block = goal + group_first_block; size = my_rsv->rsv_goal_size; + if (!rsv_is_empty(&my_rsv->rsv_window)) { /* * if the old reservation is cross group boundary @@ -908,6 +928,8 @@ my_rsv->rsv_goal_size= size; } } + + spin_lock(rsv_lock); /* * shift the search start to the window near the goal block */ @@ -921,11 +943,16 @@ * need to check the bitmap after we found a reservable window. */ retry: - prev_rsv = find_next_reservable_window(search_head, size, - &start_block, group_end_block); - if (prev_rsv == NULL) - goto failed; - reservable_space_start = start_block; + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + /* * On success, find_next_reservable_window() returns the * reservation window where there is a reservable space after it. @@ -937,8 +964,9 @@ * block. Search start from the start block of the reservable space * we just found. 
*/ + spin_unlock(rsv_lock); first_free_block = bitmap_search_next_usable_block( - reservable_space_start - group_first_block, + my_rsv->rsv_start - group_first_block, bitmap_bh, group_end_block - group_first_block + 1); if (first_free_block < 0) { @@ -946,54 +974,30 @@ * no free block left on the bitmap, no point * to reserve the space. return failed. */ - goto failed; + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ } + start_block = first_free_block + group_first_block; /* * check if the first free block is within the - * free space we just found + * free space we just reserved */ - if ((start_block >= reservable_space_start) && - (start_block < reservable_space_start + size)) - goto found_rsv_window; + if ((start_block >= my_rsv->rsv_start) && + (start_block < my_rsv->rsv_end)) + return 0; /* succeed */ /* * if the first free bit we found is out of the reservable space - * this means there is no free block on the reservable space - * we should continue search for next reservable space, + * continue search for next reservable space, * start from where the free block is, * we also shift the list head to where we stopped last time */ - search_head = prev_rsv; + search_head = my_rsv; + spin_lock(rsv_lock); goto retry; - -found_rsv_window: - /* - * great! the reservable space contains some free blocks. - * if the search returns that we should add the new - * window just next to where the old window, we don't - * need to remove the old window first then add it to the - * same place, just update the new start and new end. 
- */ - if (my_rsv != prev_rsv) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - } - my_rsv->rsv_start = reservable_space_start; - my_rsv->rsv_end = my_rsv->rsv_start + size - 1; - my_rsv->rsv_alloc_hit = 0; - if (my_rsv != prev_rsv) { - ext3_rsv_window_add(sb, my_rsv); - } - return 0; /* succeed */ -failed: - /* - * failed to find a new reservation window in the current - * group, remove the current(stale) reservation window - * if there is any - */ - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - return -1; /* failed */ } /* @@ -1023,7 +1027,6 @@ int goal, struct ext3_reserve_window_node * my_rsv, int *errp) { - spinlock_t *rsv_lock; unsigned long group_first_block; int ret = 0; int fatal; @@ -1052,7 +1055,6 @@ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); goto out; } - rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; /* * goal is a group relative block number (if there is a goal) * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) @@ -1078,30 +1080,21 @@ * then we could go to allocate from the reservation window directly. 
*/ while (1) { - struct ext3_reserve_window rsv_copy; - - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - - if (rsv_is_empty(&rsv_copy) || (ret < 0) || - !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { - spin_lock(rsv_lock); + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { ret = alloc_new_reservation(my_rsv, goal, sb, group, bitmap_bh); - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - spin_unlock(rsv_lock); if (ret < 0) break; /* failed */ - if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) + if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) goal = -1; } - if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) - || (rsv_copy._rsv_end < group_first_block)) + if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) + || (my_rsv->rsv_end < group_first_block)) BUG(); ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, - &rsv_copy); + &my_rsv->rsv_window); if (ret >= 0) { my_rsv->rsv_alloc_hit++; break; /* succeed */ Index: 2.6-8xx/fs/ext3/file.c =================================================================== --- 2.6-8xx.orig/fs/ext3/file.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/ext3/file.c 2005-06-16 13:45:08.000000000 -0300 @@ -36,7 +36,11 @@ /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) + { + down(&EXT3_I(inode)->truncate_sem); ext3_discard_reservation(inode); + up(&EXT3_I(inode)->truncate_sem); + } if (is_dx(inode) && filp->private_data) ext3_htree_free_dir_info(filp->private_data); Index: 2.6-8xx/fs/exec.c =================================================================== --- 2.6-8xx.orig/fs/exec.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/exec.c 2005-06-16 13:45:08.000000000 -0300 @@ -48,6 +48,7 @@ #include #include 
#include +#include #include #include @@ -568,11 +569,16 @@ } } task_lock(tsk); + + local_irq_disable(); // FIXME active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -667,7 +673,7 @@ * of the time. */ while (leader->exit_state != EXIT_ZOMBIE) - yield(); + msleep(1); spin_lock(&leader->proc_lock); spin_lock(&current->proc_lock); @@ -1416,9 +1422,6 @@ mm->core_waiters++; /* let other threads block */ mm->core_startup_done = &startup_done; - /* give other threads a chance to run: */ - yield(); - zap_threads(mm); if (--mm->core_waiters) { up_write(&mm->mmap_sem); Index: 2.6-8xx/fs/devfs/base.c =================================================================== --- 2.6-8xx.orig/fs/devfs/base.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/devfs/base.c 2005-06-16 13:45:08.000000000 -0300 @@ -826,7 +826,7 @@ wait_queue_head_t revalidate_wait_queue; /* Wake when devfsd sleeps */ }; -static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED }; +static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED(fs_info.devfsd_buffer_lock) }; static kmem_cache_t *devfsd_buf_cache; #ifdef CONFIG_DEVFS_DEBUG static unsigned int devfs_debug_init __initdata = DEBUG_NONE; Index: 2.6-8xx/fs/aio.c =================================================================== --- 2.6-8xx.orig/fs/aio.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/aio.c 2005-06-16 13:45:08.000000000 -0300 @@ -564,9 +564,11 @@ tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); + local_irq_disable(); // FIXME + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); Index: 2.6-8xx/fs/dcache.c =================================================================== ---
2.6-8xx.orig/fs/dcache.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/fs/dcache.c 2005-06-16 13:45:08.000000000 -0300 @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); -static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +static DECLARE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); Index: 2.6-8xx/mm/slab.c =================================================================== --- 2.6-8xx.orig/mm/slab.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/slab.c 2005-06-16 13:45:08.000000000 -0300 @@ -537,7 +537,7 @@ .limit = BOOT_CPUCACHE_ENTRIES, .objsize = sizeof(kmem_cache_t), .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, + .spinlock = SPIN_LOCK_UNLOCKED(cache_cache.spinlock), .name = "kmem_cache", #if DEBUG .reallen = sizeof(kmem_cache_t), @@ -578,9 +578,9 @@ return (void**)(ac+1); } -static inline struct array_cache *ac_data(kmem_cache_t *cachep) +static inline struct array_cache *ac_data(kmem_cache_t *cachep, int cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[cpu]; } static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) @@ -833,21 +833,22 @@ /* 4) Replace the bootstrap head arrays */ { void * ptr; + int cpu = smp_processor_id(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); - cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + local_irq_disable_nort(); + BUG_ON(ac_data(&cache_cache, cpu) != &initarray_cache.cache); + memcpy(ptr, ac_data(&cache_cache, cpu), sizeof(struct arraycache_init)); + cache_cache.array[cpu] = ptr; + local_irq_enable_nort(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, 
ac_data(malloc_sizes[0].cs_cachep), + local_irq_disable_nort(); + BUG_ON(ac_data(malloc_sizes[0].cs_cachep, cpu) != &initarray_generic.cache); + memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep, cpu), sizeof(struct arraycache_init)); - malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; - local_irq_enable(); + malloc_sizes[0].cs_cachep->array[cpu] = ptr; + local_irq_enable_nort(); } /* 5) resize the head arrays to their final sizes */ @@ -972,7 +973,7 @@ *addr++=0x12345678; *addr++=caller; - *addr++=smp_processor_id(); + *addr++=_smp_processor_id(); size -= 3*sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1203,6 +1204,7 @@ { size_t left_over, slab_size, ralign; kmem_cache_t *cachep = NULL; + int cpu = _smp_processor_id(); /* * Sanity checks... these are all serious usage bugs. @@ -1440,16 +1442,16 @@ * the cache that's used by kmalloc(24), otherwise * the creation of further caches will BUG(). */ - cachep->array[smp_processor_id()] = &initarray_generic.cache; + cachep->array[cpu] = &initarray_generic.cache; g_cpucache_up = PARTIAL; } else { - cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); + cachep->array[cpu] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); } - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; + BUG_ON(!ac_data(cachep, cpu)); + ac_data(cachep, cpu)->avail = 0; + ac_data(cachep, cpu)->limit = BOOT_CPUCACHE_ENTRIES; + ac_data(cachep, cpu)->batchcount = 1; + ac_data(cachep, cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount @@ -1503,12 +1505,14 @@ #if DEBUG static void check_irq_off(void) { - BUG_ON(!irqs_disabled()); +#ifndef CONFIG_PREEMPT_RT + BUG_ON(!raw_irqs_disabled()); +#endif } static void check_irq_on(void) { - BUG_ON(irqs_disabled()); + BUG_ON(raw_irqs_disabled()); } 
static void check_spinlock_acquired(kmem_cache_t *cachep) @@ -1545,22 +1549,39 @@ static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac, int force); -static void do_drain(void *arg) +static void do_drain_cpu(kmem_cache_t *cachep, int cpu) { - kmem_cache_t *cachep = (kmem_cache_t*)arg; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + spin_lock(&cachep->spinlock); + ac = ac_data(cachep, cpu); free_block(cachep, &ac_entry(ac)[0], ac->avail); - spin_unlock(&cachep->spinlock); ac->avail = 0; + spin_unlock(&cachep->spinlock); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * Executes in an IRQ context: + */ +static void do_drain(void *arg) +{ + do_drain_cpu((kmem_cache_t*)arg, smp_processor_id()); } +#endif static void drain_cpu_caches(kmem_cache_t *cachep) { +#ifndef CONFIG_PREEMPT_RT smp_call_function_all_cpus(do_drain, cachep); +#else + int cpu; + + for_each_online_cpu(cpu) + do_drain_cpu(cachep, cpu); +#endif check_irq_on(); spin_lock_irq(&cachep->spinlock); if (cachep->lists.shared) @@ -1827,7 +1848,7 @@ spin_unlock(&cachep->spinlock); if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_irq_enable_nort(); /* * The test for missing atomic flag is performed here, rather than @@ -1851,7 +1872,7 @@ cache_init_objs(cachep, slabp, ctor_flags); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); check_irq_off(); spin_lock(&cachep->spinlock); @@ -1865,7 +1886,7 @@ kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); return 0; } @@ -1991,14 +2012,14 @@ #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags) +static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags, int cpu) { int batchcount; struct kmem_list3 *l3; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + ac = ac_data(cachep, cpu); retry: batchcount = ac->batchcount; if 
(!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2011,7 +2032,7 @@ l3 = list3_data(cachep); BUG_ON(ac->avail > 0); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (l3->shared) { struct array_cache *shared_array = l3->shared; if (shared_array->avail) { @@ -2069,14 +2090,17 @@ must_grow: l3->free_objects -= ac->avail; alloc_done: - spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); if (unlikely(!ac->avail)) { int x; + spin_unlock_rt(&cachep->spinlock); x = cache_grow(cachep, flags, -1); - + + spin_lock_rt(&cachep->spinlock); // cache_grow can reenable interrupts, then ac could change. - ac = ac_data(cachep); + cpu = smp_processor_id_rt(cpu); + ac = ac_data(cachep, cpu); if (!x && ac->avail == 0) // no objects in sight? abort return NULL; @@ -2145,23 +2169,26 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) { + int cpu = _smp_processor_id(); unsigned long save_flags; void* objp; struct array_cache *ac; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - ac = ac_data(cachep); + local_irq_save_nort(save_flags); + spin_lock_rt(&cachep->spinlock); + ac = ac_data(cachep, cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac_entry(ac)[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, cpu); } - local_irq_restore(save_flags); + spin_unlock_rt(&cachep->spinlock); + local_irq_restore_nort(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); return objp; } @@ -2231,7 +2258,7 @@ BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (cachep->lists.shared) { struct array_cache *shared_array = cachep->lists.shared; int max = shared_array->limit-shared_array->avail; @@ -2266,7 +2293,7 @@ STATS_SET_FREEABLE(cachep, i); 
} #endif - spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); ac->avail -= batchcount; memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], sizeof(void*)*ac->avail); @@ -2281,20 +2308,22 @@ */ static inline void __cache_free(kmem_cache_t *cachep, void *objp) { - struct array_cache *ac = ac_data(cachep); + int cpu = _smp_processor_id(); + struct array_cache *ac = ac_data(cachep, cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + spin_lock_rt(&cachep->spinlock); if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); ac_entry(ac)[ac->avail++] = objp; - return; } else { STATS_INC_FREEMISS(cachep); cache_flusharray(cachep, ac); ac_entry(ac)[ac->avail++] = objp; } + spin_unlock_rt(&cachep->spinlock); } /** @@ -2395,12 +2424,12 @@ } spin_unlock_irq(&cachep->spinlock); - local_irq_disable(); + local_irq_disable_nort(); if (!cache_grow(cachep, flags, nodeid)) { - local_irq_enable(); + local_irq_enable_nort(); return NULL; } - local_irq_enable(); + local_irq_enable_nort(); } got_slabp: /* found one: allocate object */ @@ -2542,9 +2571,9 @@ { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); __cache_free(cachep, objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kmem_cache_free); @@ -2568,6 +2597,21 @@ } EXPORT_SYMBOL(kcalloc); +#ifdef CONFIG_RT_DEADLOCK_DETECT +static size_t cache_size(kmem_cache_t *c) +{ + struct cache_sizes *csizep = malloc_sizes; + + for ( ; csizep->cs_size; csizep++) { + if (csizep->cs_cachep == c) + return csizep->cs_size; + if (csizep->cs_dmacachep == c) + return csizep->cs_size; + } + return 0; +} +#endif + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. 
@@ -2582,11 +2626,16 @@ if (unlikely(!objp)) return; - local_irq_save(flags); + local_irq_save_nort(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (check_no_locks_freed(objp, objp+cache_size(c))) + printk("slab %s[%p] (%d), obj: %p\n", + c->name, c, c->objsize, objp); +#endif __cache_free(c, (void*)objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kfree); @@ -2625,13 +2674,17 @@ struct array_cache *new[NR_CPUS]; }; +/* + * Executes in IRQ context: + */ static void do_ccupdate_local(void *info) { struct ccupdate_struct *new = (struct ccupdate_struct *)info; struct array_cache *old; +// WARN_ON(!in_interrupt()); check_irq_off(); - old = ac_data(new->cachep); + old = ac_data(new->cachep, smp_processor_id()); new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; @@ -2740,6 +2793,10 @@ if (limit > 32) limit = 32; #endif +#ifdef CONFIG_PREEMPT + if (limit > 16) + limit = 16; +#endif err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", @@ -2779,11 +2836,12 @@ */ static void cache_reap(void *unused) { + int cpu = _smp_processor_id(); struct list_head *walk; if (down_trylock(&cache_chain_sem)) { /* Give up. Setup the next iteration. 
*/ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + cpu); return; } @@ -2802,7 +2860,7 @@ spin_lock_irq(&searchp->spinlock); - drain_array_locked(searchp, ac_data(searchp), 0); + drain_array_locked(searchp, ac_data(searchp, cpu), 0); if(time_after(searchp->lists.next_reap, jiffies)) goto next_unlock; @@ -2846,7 +2904,7 @@ check_irq_on(); up(&cache_chain_sem); /* Setup the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC+cpu); } #ifdef CONFIG_PROC_FS @@ -3067,10 +3125,10 @@ unsigned int size = 0; if (likely(objp != NULL)) { - local_irq_save(flags); + local_irq_save_nort(flags); c = GET_PAGE_CACHE(virt_to_page(objp)); size = kmem_cache_size(c); - local_irq_restore(flags); + local_irq_restore_nort(flags); } return size; Index: 2.6-8xx/mm/highmem.c =================================================================== --- 2.6-8xx.orig/mm/highmem.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/highmem.c 2005-06-16 13:45:08.000000000 -0300 @@ -242,11 +242,11 @@ unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ Index: 2.6-8xx/mm/page-writeback.c =================================================================== --- 2.6-8xx.orig/mm/page-writeback.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/page-writeback.c 2005-06-16 13:45:08.000000000 -0300 @@ -368,10 +368,8 @@ static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static struct timer_list wb_timer = - TIMER_INITIALIZER(wb_timer_fn, 0, 0); -static struct timer_list 
laptop_mode_wb_timer = - TIMER_INITIALIZER(laptop_timer_fn, 0, 0); +static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); +static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. Index: 2.6-8xx/mm/page_alloc.c =================================================================== --- 2.6-8xx.orig/mm/page_alloc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/page_alloc.c 2005-06-16 13:45:08.000000000 -0300 @@ -369,6 +369,9 @@ int i; arch_free_page(page, order); + if (!PageHighMem(page)) + check_no_locks_freed(page_address(page), + page_address(page+(1<lock, flags); return allocated; } +#endif #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) @@ -598,6 +603,7 @@ #endif } +#if !defined(CONFIG_PREEMPT_RT) /* * Free a 0-order page */ @@ -624,15 +630,32 @@ local_irq_restore(flags); put_cpu(); } +#endif +/* + * On PREEMPT_RT we use a simple solution for the time being, + * per-CPU allocation is disabled. 
+ */ void fastcall free_hot_page(struct page *page) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 0); +#endif } void fastcall free_cold_page(struct page *page) { +#ifdef CONFIG_PREEMPT_RT + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 1); +#endif } static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) @@ -654,6 +677,7 @@ { unsigned long flags; struct page *page = NULL; +#if !defined(CONFIG_PREEMPT_RT) int cold = !!(gfp_flags & __GFP_COLD); if (order == 0) { @@ -672,6 +696,7 @@ local_irq_restore(flags); put_cpu(); } +#endif if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); @@ -950,8 +975,15 @@ { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(pvec->pages[i])) + pvec->pages[i]->mapping = NULL; + __free_pages_ok(pvec->pages[i], 0); +#else free_hot_cold_page(pvec->pages[i], pvec->cold); +#endif + } } fastcall void __free_pages(struct page *page, unsigned int order) Index: 2.6-8xx/mm/swap.c =================================================================== --- 2.6-8xx.orig/mm/swap.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/swap.c 2005-06-16 13:45:08.000000000 -0300 @@ -136,39 +136,45 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) 
__pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); } void fastcall lru_cache_add_active(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } void lru_add_drain(void) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec; + pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); + + pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } /* Index: 2.6-8xx/mm/mmap.c =================================================================== --- 2.6-8xx.orig/mm/mmap.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/mmap.c 2005-06-16 13:45:08.000000000 -0300 @@ -1794,10 +1794,16 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_KERNEL - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } Index: 2.6-8xx/mm/memory.c =================================================================== --- 2.6-8xx.orig/mm/memory.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/mm/memory.c 2005-06-16 13:45:08.000000000 -0300 @@ -116,7 +116,7 @@ pmd_clear(pmd); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; + 
tlb_mm(tlb)->nr_ptes--; } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -241,7 +241,7 @@ return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb_mm(*tlb), addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -250,7 +250,7 @@ } while (pgd++, addr = next, addr != end); if (!tlb_is_full_mm(*tlb)) - flush_tlb_pgtables((*tlb)->mm, start, end); + flush_tlb_pgtables(tlb_mm(*tlb), start, end); } void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, @@ -551,22 +551,22 @@ page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear(tlb->mm, addr, pte); + ptent = ptep_get_and_clear(tlb_mm(tlb), addr, pte); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(tlb->mm, addr, pte, + set_pte_at(tlb_mm(tlb), addr, pte, pgoff_to_pte(page->index)); if (pte_dirty(ptent)) set_page_dirty(page); if (PageAnon(page)) - dec_mm_counter(tlb->mm, anon_rss); + dec_mm_counter(tlb_mm(tlb), anon_rss); else if (pte_young(ptent)) mark_page_accessed(page); - tlb->freed++; + tlb_free(tlb); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -579,7 +579,7 @@ continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear(tlb->mm, addr, pte); + pte_clear(tlb_mm(tlb), addr, pte); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); } Index: 2.6-8xx/kernel/stop_machine.c =================================================================== --- 2.6-8xx.orig/kernel/stop_machine.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/stop_machine.c 2005-06-16 13:45:08.000000000 -0300 @@ -56,7 +56,7 @@ /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. 
*/ if (!prepared && !irqs_disabled) - yield(); + __yield(); else cpu_relax(); } @@ -110,7 +110,7 @@ /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + __yield(); /* If some failed, kill them all. */ if (ret < 0) { Index: 2.6-8xx/kernel/rt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/kernel/rt.c 2005-06-16 13:45:08.000000000 -0300 @@ -0,0 +1,2079 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * These flags are used for allowing of stealing of ownerships. + */ +#define RT_PENDOWNER 1 /* pending owner on a lock */ + +#define TASK_PENDING(task) \ + ((task)->rt_flags & RT_PENDOWNER) + +/* + * This flag is good for debugging the PI code - it makes all tasks + * in the system fall under PI handling. Normally only SCHED_FIFO/RR + * tasks are PI-handled: + */ +//#define ALL_TASKS_PI + +/* + * We need a global lock for priority inheritance handling. + * This is only for the slow path, but still, we might want + * to optimize it later to be more scalable. 
+ */ +static __cacheline_aligned_in_smp raw_spinlock_t pi_lock = + RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_RT_DEADLOCK_DETECT +/* + * We need a global lock when we walk through the multi-process + * lock tree... + */ +static raw_spinlock_t trace_lock = RAW_SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(held_locks); + +/* + * Deadlock detection flag. We turn it off when we detect + * the first problem because we don't want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +static int trace_on = 1; + +void deadlock_trace_off(void) +{ + trace_on = 0; +} + +#define trace_lock_irq(lock) \ + do { \ + raw_local_irq_disable(); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + } while (0) + +#define trace_unlock_irq(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + raw_local_irq_enable(); \ + preempt_check_resched(); \ + } while (0) + +#define trace_lock_irqsave(lock, flags) \ + do { \ + raw_local_irq_save(flags); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock_irqrestore(lock, flags) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + raw_local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) + +#define TRACE_OFF() \ +do { \ + if (trace_on) { \ + trace_on = 0; \ + console_verbose(); \ + spin_unlock(&trace_lock); \ + } \ +} while (0) + +#define TRACE_BUG() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +#define TRACE_WARN_ON(c) \ +do { \ + if (c) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +# define trace_local_irq_disable() raw_local_irq_disable() +# define trace_local_irq_enable() raw_local_irq_enable() +# define trace_local_irq_restore(flags) raw_local_irq_restore(flags) +# define TRACE_BUG_ON(c) do { if (c) TRACE_BUG(); } while (0) +#else +# define trace_lock_irq(lock) preempt_disable() +# define trace_lock_irqsave(lock, flags) do { (void)flags; preempt_disable(); } 
while (0) +# define trace_unlock(lock) do { } while (0) + +# define trace_unlock_irq(lock) preempt_enable() +# define trace_unlock_irqrestore(lock, flags) do { (void)flags; preempt_enable(); } while (0) +# define trace_local_irq_disable() preempt_disable() +# define trace_local_irq_enable() preempt_enable() +# define trace_local_irq_restore(flags) do { (void)flags; preempt_enable(); } while (0) + +# define TRACE_BUG() do { } while (0) +# define TRACE_WARN_ON(c) do { } while (0) +# define TRACE_OFF() do { } while (0) +# define TRACE_BUG_ON(c) do { } while (0) +#endif /* CONFIG_RT_DEADLOCK_DETECT */ + +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + spin_lock_init(&pi_lock); +#ifdef CONFIG_RT_DEADLOCK_DETECT + spin_lock_init(&trace_lock); +#endif +} + +#ifdef CONFIG_RT_DEADLOCK_DETECT + +static void printk_task(struct task_struct *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_task_short(struct task_struct *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && lock->owner) { + printk(".. held by: "); + printk_task(lock->owner); + printk("\n"); + } + if (lock->owner) { + printk("... 
acquired at: "); + print_symbol("%s\n", lock->acquire_eip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->task:\n"); + printk_task(w->task); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| blocked at: "); + print_symbol("%s\n", w->eip); + printk("-------------------------\n"); +} + +static void show_task_locks(struct task_struct *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_RUNNING_MUTEX: printk("M"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->blocked_on) { + struct rt_mutex *lock = p->blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +static void show_held_locks(struct task_struct *filter) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + struct task_struct *p; + unsigned long flags; + int count = 0; + + printk("\n"); + if (filter) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(filter); + printk("):\n"); + printk("------------------------------\n"); + } else { + printk("---------------------------\n"); + printk("| showing all locks held: |\n"); + printk("---------------------------\n"); + } + + /* + * Play safe and acquire the global trace lock. 
We + * cannot printk with that lock held so we iterate + * very carefully: + */ +next: + trace_lock_irqsave(&trace_lock, flags); + list_for_each(curr, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (filter && (p != filter)) + continue; + count++; + cursor = curr->next; + trace_unlock_irqrestore(&trace_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, filter ? 0 : 1); + goto next; + } + trace_unlock_irqrestore(&trace_lock, flags); +} + +void show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + printk("\nshowing all tasks:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... 
"); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + show_held_locks(NULL); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +static int check_deadlock(struct rt_mutex *lock, int depth, + unsigned long eip) +{ + struct rt_mutex *lockblk; + struct task_struct *task; + + if (!trace_on) + return 0; + /* + * Special-case: the BKL self-releases at schedule() + * time so it can never deadlock: + */ + if (lock == &kernel_sem.lock) + return 0; + task = lock->owner; + if (!task) + return 0; + lockblk = NULL; + if (task->blocked_on) + lockblk = task->blocked_on->lock; + if (current == task) { + TRACE_OFF(); + if (depth) + return 1; + printk("\n==========================================\n"); + printk( "[ BUG: lock recursion deadlock detected! |\n"); + printk( "------------------------------------------\n"); + printk("already locked: "); + printk_lock(lock, 1); + show_held_locks(task); + printk("\n-{current task's backtrace}----------------->\n"); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. ]\n\n"); + trace_local_irq_disable(); + return 0; + } + /* + * Skip the BKL: + */ + if (lockblk == &kernel_sem.lock) + return 0; + /* + * Ugh, something corrupted the lock data structure? + */ + if (depth > 30) { + TRACE_OFF(); + printk("\n===========================================\n"); + printk( "[ BUG: infinite lock dependency detected!? 
|\n"); + printk( "-------------------------------------------\n"); + goto print_it; + } + if (lockblk && check_deadlock(lockblk, depth+1, eip)) { + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk( "--------------------------------------------\n"); +print_it: + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", eip); + + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task->pid); + printk_lock(lockblk, 1); + + show_held_locks(current); + show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. 
]\n\n"); + trace_local_irq_disable(); + return 0; + } + return 0; +} + +void check_no_held_locks(struct task_struct *task) +{ + struct list_head *curr, *next, *cursor = NULL; + struct plist *curr1; + struct rt_mutex *lock; + struct rt_mutex_waiter *w; + struct task_struct *p; + unsigned long flags; + + if (!trace_on) + return; +restart: + trace_lock_irqsave(&trace_lock, flags); + list_for_each_safe(curr, next, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (p != task) + continue; + cursor = next; + list_del_init(curr); + trace_unlock_irqrestore(&trace_lock, flags); + + if (lock == &kernel_sem.lock) { + printk("BUG: %s/%d, BKL held at task exit time!\n", + current->comm, current->pid); + printk("BKL acquired at: "); + print_symbol("%s\n", + (unsigned long) current->last_kernel_lock); + } else + printk("BUG: %s/%d, lock held at task exit time!\n", + current->comm, current->pid); + printk_lock(lock, 1); + if (lock->owner != task) + printk("exiting task is not even the owner??\n"); + goto restart; + } + spin_lock(&pi_lock); + plist_for_each(curr1, &task->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + TRACE_OFF(); + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + printk("hm, PI interest held at exit time? 
Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); +} + +int check_no_locks_freed(const void *from, const void *to) +{ + struct list_head *curr, *next, *cursor = NULL; + struct rt_mutex *lock; + unsigned long flags; + void *lock_addr; + int err = 0; + + if (!trace_on) + return err; +restart: + trace_lock_irqsave(&trace_lock, flags); + list_for_each_safe(curr, next, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + cursor = next; + list_del_init(curr); + TRACE_OFF(); + trace_unlock_irqrestore(&trace_lock, flags); + err = 1; + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (lock->owner != current) + printk("freeing task is not even the owner??\n"); + goto restart; + } + trace_unlock_irqrestore(&trace_lock, flags); + + return err; +} + +#endif + +#if defined(ALL_TASKS_PI) && defined(CONFIG_RT_DEADLOCK_DETECT) + +static void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct plist *curr1; + + TRACE_WARN_ON(plist_empty(&waiter->pi_list)); + TRACE_WARN_ON(lock->owner); + + plist_for_each(curr1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + if (w == waiter) + goto ok; + } + TRACE_WARN_ON(1); +ok: +} + +static void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct plist *curr1; + + plist_for_each(curr1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + if (w->lock == lock) { + TRACE_OFF(); + printk("hm, PI interest but no waiter? 
Old owner:\n"); + printk_waiter(w); + printk("\n"); + TRACE_WARN_ON(1); + return; + } + } +} + +#else + +static inline void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ +} + +static inline void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ +} + +#endif + +/* + * Move PI waiters of this lock to the new owner: + */ +static void +change_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner) +{ + struct plist *next1, *curr1; + struct rt_mutex_waiter *w; + int requeued = 0, sum = 0; + + if (old_owner == new_owner) + return; + + plist_for_each_safe(curr1, next1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + if (w->lock == lock) { + plist_del(&w->pi_list, &old_owner->pi_waiters); + plist_init(&w->pi_list, w->task->prio); + plist_add(&w->pi_list, &new_owner->pi_waiters); + requeued++; + } + sum++; + } + trace_special(sum, requeued, 0); +} + +int pi_walk, pi_null, pi_prio; + +static void pi_setprio(struct rt_mutex *lock, struct task_struct *p, int prio) +{ + if (unlikely(!p->pid)) { + pi_null++; + return; + } + +#ifdef CONFIG_RT_DEADLOCK_DETECT + pi_prio++; + if (p->policy != SCHED_NORMAL && prio > mutex_getprio(p)) { + TRACE_OFF(); + + printk("huh? 
(%d->%d??)\n", p->prio, prio); + printk("owner:\n"); + printk_task(p); + printk("\ncurrent:\n"); + printk_task(current); + printk("\nlock:\n"); + printk_lock(lock, 1); + dump_stack(); + trace_local_irq_disable(); + } +#endif + /* + * If the task is blocked on some other task then boost that + * other task (or tasks) too: + */ + for (;;) { + struct rt_mutex_waiter *w = p->blocked_on; +#ifdef CONFIG_RT_DEADLOCK_DETECT + int was_rt = rt_task(p); +#endif + + mutex_setprio(p, prio); + if (!w) + break; + /* + * If the task is blocked on a lock, and we just made + * it RT, then register the task in the PI list and + * requeue it to the wait list: + */ + lock = w->lock; + TRACE_BUG_ON(!lock); + TRACE_BUG_ON(!lock->owner); + if (rt_task(p) && plist_empty(&w->pi_list)) { + TRACE_BUG_ON(was_rt); + plist_init(&w->pi_list, prio); + plist_add(&w->pi_list, &lock->owner->pi_waiters); + + plist_del(&w->list, &lock->wait_list); + plist_init(&w->list, prio); + plist_add(&w->list, &lock->wait_list); + + } + /* + * If the task is blocked on a lock, and we just restored + * it from RT to non-RT then unregister the task from + * the PI list and requeue it to the wait list. + * + * (TODO: this can be unfair to SCHED_NORMAL tasks if they + * get PI handled.) 
+ */ + if (!rt_task(p) && !plist_empty(&w->pi_list)) { + TRACE_BUG_ON(!was_rt); + plist_del(&w->pi_list, &lock->owner->pi_waiters); + plist_del(&w->list, &lock->wait_list); + plist_init(&w->list, prio); + plist_add(&w->list, &lock->wait_list); + + } + + pi_walk++; + + p = lock->owner; + TRACE_BUG_ON(!p); + /* + * If the dependee is already higher-prio then + * no need to boost it, and all further tasks down + * the dependency chain are already boosted: + */ + if (p->prio <= prio) + break; + } +} + +static void +task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct task_struct *task, + struct rt_mutex *lock, unsigned long eip) +{ +#ifdef CONFIG_RT_DEADLOCK_DETECT + check_deadlock(lock, 0, eip); + /* mark the current thread as blocked on the lock */ + waiter->eip = eip; +#endif + task->blocked_on = waiter; + waiter->lock = lock; + waiter->task = task; + plist_init(&waiter->pi_list, task->prio); + /* + * Add SCHED_NORMAL tasks to the end of the waitqueue (FIFO): + */ +#ifndef ALL_TASKS_PI + if (!rt_task(task)) { + plist_add(&waiter->list, &lock->wait_list); + return; + } +#endif + spin_lock(&pi_lock); + plist_add(&waiter->pi_list, &lock->owner->pi_waiters); + /* + * Add RT tasks to the head: + */ + plist_add(&waiter->list, &lock->wait_list); + /* + * If the waiter has higher priority than the owner + * then temporarily boost the owner: + */ + if (task->prio < lock->owner->prio) + pi_setprio(lock, lock->owner, task->prio); + spin_unlock(&pi_lock); +} + +/* + * initialise the lock: + */ +static void __init_rt_mutex(struct rt_mutex *lock, int save_state, + char *name, char *file, int line) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + preempt_disable(); + plist_init(&lock->wait_list, MAX_PRIO); + preempt_enable(); +#ifdef CONFIG_RT_DEADLOCK_DETECT + lock->save_state = save_state; + INIT_LIST_HEAD(&lock->held_list); + lock->name = name; + lock->file = file; + lock->line = line; +#endif +} + +void fastcall __init_rwsem(struct rw_semaphore *rwsem, int 
save_state, + char *name, char *file, int line) +{ + __init_rt_mutex(&rwsem->lock, save_state, name, file, line); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__init_rwsem); + +static inline +void set_new_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner, unsigned long eip) +{ + if (new_owner) + trace_special_pid(new_owner->pid, new_owner->prio, 0); + if (unlikely(old_owner)) + change_owner(lock, old_owner, new_owner); + lock->owner = new_owner; + lock->owner_prio = new_owner->prio; +#ifdef CONFIG_RT_DEADLOCK_DETECT + TRACE_WARN_ON(!list_empty(&lock->held_list)); + list_add_tail(&lock->held_list, &held_locks); + lock->acquire_eip = eip; +#endif +} + +/* + * handle the lock release when there are processes blocked on it that can now run + * - the spinlock must be held by the caller + */ +static struct task_struct * pick_new_owner(struct rt_mutex *lock, + struct task_struct *old_owner, int save_state, + unsigned long eip) +{ + struct rt_mutex_waiter *waiter = NULL; + struct task_struct *new_owner; + + /* + * Get the highest prio one: + * + * (same-prio RT tasks go FIFO) + */ + waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list); + + trace_special_pid(waiter->task->pid, waiter->task->prio, 0); + +#ifdef ALL_TASKS_PI + check_pi_list_present(lock, waiter, old_owner); +#endif + new_owner = waiter->task; + plist_del_init(&waiter->list, &lock->wait_list); + + plist_del(&waiter->pi_list, &old_owner->pi_waiters); + plist_init(&waiter->pi_list, waiter->task->prio); + + set_new_owner(lock, old_owner, new_owner, waiter->eip); + /* Don't touch waiter after ->task has been NULLed */ + mb(); + waiter->task = NULL; + new_owner->blocked_on = NULL; + TRACE_WARN_ON(save_state != lock->save_state); + + return new_owner; +} + +static inline void init_lists(struct rt_mutex *lock) +{ + // we have to do this until the static initializers get fixed: + if (unlikely(!lock->wait_list.dp_node.prev && + !lock->wait_list.dp_node.next)) + 
plist_init(&lock->wait_list, MAX_PRIO); +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->held_list.prev && !lock->held_list.next) + INIT_LIST_HEAD(&lock->held_list); +#endif +} + +/* + * Try to grab a lock, and if it is owned but the owner + * hasn't woken up yet, see if we can steal it. + * + * Return: 1 if task can grab lock. + * 0 if not. + */ +static int grab_lock(struct rt_mutex *lock, struct task_struct *task) +{ + struct task_struct *owner = lock->owner; + + if (!owner) + return 1; + /* + * The lock is owned, but now test to see if the owner + * is still sleeping and hasn't woken up to get the lock. + */ + + /* Test the simple case first, is it already running? */ + if (!TASK_PENDING(owner)) + return 0; + + /* The owner is pending on a lock, but is it this lock? */ + if (owner->pending_owner != lock) + return 0; + + /* + * There's an owner, but it hasn't woken up to take the lock yet. + * See if we should steal it from him. + */ + if (task->prio > owner->prio) + return 0; + + /* + * The BKL is a PITA. Don't ever steal it + */ + if (lock == &kernel_sem.lock) + return 0; + + /* + * This task is of higher priority than the current pending + * owner, so we may steal it. + */ + owner->rt_flags &= ~RT_PENDOWNER; + owner->pending_owner = NULL; + +#ifdef CONFIG_RT_DEADLOCK_DETECT + /* + * This task will be taking the ownership away, and + * when it does, the lock can't be on the held list. + */ + TRACE_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); +#endif + return 1; +} + +/* + * Bring a task from pending ownership to owning a lock. + * + * Return 0 if we secured it, otherwise non-zero if it was + * stolen. + */ +static int capture_lock(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + struct rt_mutex *lock = waiter->lock; + unsigned long flags; + int ret = 0; + + /* + * The BKL is special, we always get it. 
+ */ + if (lock == &kernel_sem.lock) + return 0; + + trace_lock_irqsave(&trace_lock, flags); + spin_lock(&lock->wait_lock); + + if (!(task->rt_flags & RT_PENDOWNER)) { + /* someone else stole it */ + TRACE_BUG_ON(lock->owner == task); + if (grab_lock(lock,task)) { + /* we got it back! */ + struct task_struct *old_owner = lock->owner; + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, waiter->eip); + spin_unlock(&pi_lock); + ret = 0; + } else { + /* Add ourselves back to the list */ + task_blocks_on_lock(waiter,task,lock,waiter->eip); + ret = 1; + } + } else { + task->rt_flags &= ~RT_PENDOWNER; + task->pending_owner = NULL; + } + + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return ret; +} + +/* + * lock it semaphore-style: no worries about missed wakeups. + */ +static void __sched __down(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!raw_irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return; + } + + set_task_state(task, TASK_UNINTERRUPTIBLE); + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!raw_irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + +wait_again: + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + 
schedule(); + set_task_state(task, TASK_UNINTERRUPTIBLE); + } + /* + * Check to see if we didn't have ownership stolen. + */ + if (capture_lock(&waiter,task)) { + set_task_state(task, TASK_UNINTERRUPTIBLE); + goto wait_again; + } + + current->flags |= nosched_flag; + task->state = TASK_RUNNING; +} + +/* + * get a write lock on the rw-semaphore + */ +void fastcall __sched rt_down_write(struct rw_semaphore *rwsem) +{ + __down(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_write); + +/* + * get a read lock on the rw-semaphore + */ +void fastcall __sched rt_down_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + return __down(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_read); + +/* + * lock it mutex-style: this variant is very careful not to + * miss any non-mutex wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seamless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops.
+ */ +static void __sched __down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + unsigned long state, saved_state, nosched_flag; + struct task_struct *task = current; + struct rt_mutex_waiter waiter; + unsigned long flags; + int got_wakeup = 0; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!raw_irqs_disabled()); + __raw_spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + __raw_spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + __raw_spin_unlock(&pi_lock); + __raw_spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return; + } + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!raw_irqs_disabled()); + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll + * take any intermediate wakeup into account as well, + * independently of the mutex sleep/wakeup mechanism: + */ + saved_state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + + /* we don't need to touch the lock struct anymore */ + __raw_spin_unlock(&lock->wait_lock); + trace_unlock(&trace_lock); + + /* + * TODO: check 'flags' for the IRQ bit here - it is illegal to + * call down() from an IRQs-off section that results in + * an actual reschedule. + */ + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + +wait_again: + /* wait to be given the lock */ + for (;;) { + unsigned long saved_flags = current->flags & PF_NOSCHED; + + if (!waiter.task) + break; + trace_local_irq_enable(); + current->flags &= ~PF_NOSCHED; + schedule(); + current->flags |= saved_flags; + trace_local_irq_disable(); + state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + if (state == TASK_RUNNING) + got_wakeup = 1; + } + /* + * Check to see if we didn't have ownership stolen. 
+ */ + if (capture_lock(&waiter,task)) { + state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + if (state == TASK_RUNNING) + got_wakeup = 1; + goto wait_again; + } + + /* + * Only set the task's state to TASK_RUNNING if it got + * a non-mutex wakeup. We keep the original state otherwise. + * A mutex wakeup changes the task's state to TASK_RUNNING_MUTEX, + * not TASK_RUNNING - hence we can differentiate between the two + * cases: + */ + state = xchg(&task->state, saved_state); + if (state == TASK_RUNNING) + got_wakeup = 1; + if (got_wakeup) + task->state = TASK_RUNNING; + trace_local_irq_enable(); + preempt_check_resched(); + + current->flags |= nosched_flag; +} + +/* + * TODO: push this into __down_mutex() + * + * BKL users expect the BKL to be held across spinlock/rwlock-acquire. + * Save and clear it, this will cause the scheduler to not drop the + * BKL semaphore if we end up scheduling: + */ +#define SAVE_BKL(ACTION) \ +do { \ + struct task_struct *task = current; \ + unsigned int saved_lock_depth; \ + \ + saved_lock_depth = task->lock_depth; \ + task->lock_depth = -1; \ + \ + might_sleep(); \ + ACTION; \ + \ + task->lock_depth = saved_lock_depth; \ +} while (0) + + +static void __sched down_write_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +static void __sched down_read_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + /* + * Read locks within the write lock succeed.
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +/* + * get a lock - interruptible + */ +static int __sched __down_interruptible(struct rt_mutex *lock, + unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + int ret; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!raw_irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return 0; + } + + set_task_state(task, TASK_INTERRUPTIBLE); + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!raw_irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + + ret = 0; +wait_again: + /* wait to be given the lock */ + for (;;) { + if (signal_pending(current)) { + /* + * Remove ourselves from the wait list if we + * didnt get the lock - else return success: + */ + trace_lock_irq(&trace_lock); + spin_lock(&lock->wait_lock); + if (waiter.task) { + plist_del_init(&waiter.list, &lock->wait_list); + /* + * Just remove ourselves from the PI list. + * (No big problem if our PI effect lingers + * a bit - owner will restore prio.) 
+ */ + spin_lock(&pi_lock); + plist_del(&waiter.pi_list, &lock->owner->pi_waiters); + plist_init(&waiter.pi_list, waiter.task->prio); + spin_unlock(&pi_lock); + ret = -EINTR; + } + spin_unlock(&lock->wait_lock); + trace_unlock_irq(&trace_lock); + break; + } + if (!waiter.task) + break; + schedule(); + set_task_state(task, TASK_INTERRUPTIBLE); + } + + /* + * Check to see if we didn't have ownership stolen. + */ + if (!ret) { + if (capture_lock(&waiter,task)) { + set_task_state(task, TASK_INTERRUPTIBLE); + goto wait_again; + } + } + + task->state = TASK_RUNNING; + current->flags |= nosched_flag; + + return ret; +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +static int __down_trylock(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags; + int ret = 0; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!raw_irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + ret = 1; + } + + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return ret; +} + +int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_write_trylock); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static int down_write_trylock_mutex(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +static int down_read_trylock_mutex(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +/* + * release the lock: + */ +static void __up_mutex(struct rt_mutex *lock, int save_state, unsigned long eip) +{ + struct task_struct *old_owner, *new_owner; + struct rt_mutex_waiter *w; + unsigned long flags; + int prio; + + TRACE_WARN_ON(save_state != lock->save_state); + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!raw_irqs_disabled()); + __raw_spin_lock(&lock->wait_lock); + TRACE_BUG_ON(!lock->wait_list.dp_node.prev && !lock->wait_list.dp_node.next); + +#ifdef CONFIG_RT_DEADLOCK_DETECT + TRACE_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); +#endif + __raw_spin_lock(&pi_lock); + + old_owner = lock->owner; +#ifdef ALL_TASKS_PI + if (plist_empty(&lock->wait_list)) + check_pi_list_empty(lock, old_owner); +#endif + lock->owner = NULL; + new_owner = NULL; + /* + * For some reason gcc doesnt recognize the unlikely() attribute, + * resulting in slightly suboptimal code generation, so i've + * open-coded !plist_empty() here: + */ +// if (unlikely(!plist_empty(&lock->wait_list))) + if (unlikely(!list_empty(&lock->wait_list.dp_node) || !list_empty(&lock->wait_list.sp_node))) + new_owner = pick_new_owner(lock, old_owner, save_state, eip); + + /* + * If the owner got priority-boosted then restore it + * to the previous priority (or to the next highest prio + * waiter's priority): + */ + prio = old_owner->normal_prio; +// if 
(unlikely(!plist_empty(&old_owner->pi_waiters))) { + if (unlikely(!list_empty(&old_owner->pi_waiters.dp_node) || !list_empty(&old_owner->pi_waiters.sp_node))) { + w = plist_first_entry(&old_owner->pi_waiters, struct rt_mutex_waiter, pi_list); + if (w->task->prio < prio) + prio = w->task->prio; + } + if (unlikely(prio != old_owner->prio)) + pi_setprio(lock, old_owner, prio); + + if (unlikely(new_owner)) { + if (lock != &kernel_sem.lock) { + new_owner->rt_flags |= RT_PENDOWNER; + new_owner->pending_owner = lock; + } + if (save_state) + wake_up_process_mutex(new_owner); + else + wake_up_process(new_owner); + } + __raw_spin_unlock(&pi_lock); + __raw_spin_unlock(&lock->wait_lock); + +#ifdef PREEMPT_DIRECT + trace_unlock(&trace_lock); + /* + * Common place where preemption is requested - if we can + * reschedule then do it here without enabling interrupts + * again (and lengthening latency): + */ + if (need_resched() && !raw_irqs_disabled_flags(flags) && !preempt_count()) + preempt_schedule_irq(); + trace_local_irq_restore(flags); +#else + trace_unlock_irqrestore(&trace_lock, flags); +#endif + /* no need to check for preempt here - we just handled it */ +} + +/* + * Do owner check too: + */ +void fastcall rt_up_write(struct rw_semaphore *rwsem) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_up_write); + +static void _up_write(struct rw_semaphore *rwsem, unsigned long eip) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, eip); +} + +void fastcall up_write_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 1, eip); +} + +/* + * release a read lock on the semaphore + */ +void fastcall rt_up_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return _up_write(rwsem, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_up_read); + +void fastcall up_read_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return up_write_mutex(rwsem, eip); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void fastcall rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG(); +} +EXPORT_SYMBOL(rt_downgrade_write); + +static int rt_mutex_is_locked(struct rt_mutex *lock) +{ + int ret; + + mb(); + ret = lock->owner != NULL; + + return ret; +} + +int fastcall rt_rwsem_is_locked(struct rw_semaphore *rwsem) +{ + return rt_mutex_is_locked(&rwsem->lock); +} +EXPORT_SYMBOL(rt_rwsem_is_locked); + +static void _down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + __down_mutex(lock, eip); +} + +void fastcall __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __init_rt_mutex(&sem->lock, 0, name, file, line); + __down(&sem->lock, CALLER_ADDR0); + break; + default: + __init_rt_mutex(&sem->lock, 0, name, file, line); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + +static int down_trylock_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + return __down_trylock(lock, eip); +} + +void fastcall up_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + WARN_ON(lock->owner != current); + __up_mutex(lock, 1, 
eip); +} + +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem, unsigned long eip) +{ + int count = atomic_dec_return(&sem->count); + + TRACE_WARN_ON(sem->lock.save_state != 0); + WARN_ON(count < 0); + + if (count > 0) + __up_mutex(&sem->lock, 0, eip); +} + +void fastcall rt_down(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + __down(&sem->lock, CALLER_ADDR0); + __down_complete(sem, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down); + +int fastcall rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + TRACE_WARN_ON(sem->lock.save_state != 0); + ret = __down_interruptible(&sem->lock, CALLER_ADDR0); + if (ret) + return ret; + __down_complete(sem, CALLER_ADDR0); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int fastcall rt_down_trylock(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. 
It would be quite complex to remove + * these transient failures so let's try it the simple way first: + */ + if (__down_trylock(&sem->lock, CALLER_ADDR0)) { + __down_complete(sem, CALLER_ADDR0); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void fastcall rt_up(struct semaphore *sem) +{ + int count; + + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (count == 1) + __up_mutex(&sem->lock, 0, CALLER_ADDR0); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +int fastcall rt_sem_is_locked(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return rt_mutex_is_locked(&sem->lock); +} +EXPORT_SYMBOL(rt_sem_is_locked); + +int fastcall rt_sema_count(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return atomic_read(&sem->count); +} +EXPORT_SYMBOL(rt_sema_count); + +/* + * Spinlock wrappers: + * + * (DEBUG_RT_LOCKING_MODE is a spinning/preempt-disabling variant of the APIs. + * Used for debugging/profiling only.)
+ */ + +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + +int preempt_locks_user = 0; +int preempt_locks = 0; + +/* + * Called from the idle thread - it is not safe to switch the locking + * mode runtime from a normal process context (locks might be in use) + */ +void propagate_preempt_locks_value(void) +{ + if (preempt_locks != preempt_locks_user) + preempt_locks = preempt_locks_user; +} + +#else +# define preempt_locks 1 +#endif + +static inline void __spin_lock(spinlock_t *lock, unsigned long eip) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_spin_lock(&lock->lock.debug_slock); + else +#endif + SAVE_BKL(_down_mutex(&lock->lock, eip)); +} + +void __lockfunc _spin_lock(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock); + +void __lockfunc _spin_lock_bh(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_bh); + +void __lockfunc _spin_lock_irq(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_irq); + +unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *spin) +{ + unsigned long flags; + + __spin_lock(spin, CALLER_ADDR0); + local_save_flags(flags); + + return flags; +} +EXPORT_SYMBOL(_spin_lock_irqsave); + +static inline void __spin_unlock(spinlock_t *lock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_spin_unlock(&lock->lock.debug_slock); + else +#endif + up_mutex(&lock->lock, CALLER_ADDR0); +} + +void __lockfunc _spin_unlock(spinlock_t *lock) +{ + __spin_unlock(lock); +} +EXPORT_SYMBOL(_spin_unlock); + +void __lockfunc _spin_unlock_wait(spinlock_t *lock) +{ + do { + barrier(); + } while (spin_is_locked(&lock->lock.wait_lock)); +} +EXPORT_SYMBOL(_spin_unlock_wait); + +void __lockfunc _spin_unlock_bh(spinlock_t *lock) +{ + __spin_unlock(lock); +} +EXPORT_SYMBOL(_spin_unlock_bh); + +void __lockfunc _spin_unlock_irq(spinlock_t *lock) +{ + __spin_unlock(lock); +} +EXPORT_SYMBOL(_spin_unlock_irq); + +void __lockfunc 
_spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +{ + __spin_unlock(lock); +} +EXPORT_SYMBOL(_spin_unlock_irqrestore); + +static inline int __spin_trylock(spinlock_t *lock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return _raw_spin_trylock(&lock->lock.debug_slock); + else +#endif + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} + +int __lockfunc _spin_trylock(spinlock_t *lock) +{ + return __spin_trylock(lock); +} +EXPORT_SYMBOL(_spin_trylock); + +int __lockfunc _spin_trylock_bh(spinlock_t *lock) +{ + return __spin_trylock(lock); +} +EXPORT_SYMBOL(_spin_trylock_bh); + +int __lockfunc _spin_trylock_irq(spinlock_t *lock) +{ + return __spin_trylock(lock); +} +EXPORT_SYMBOL(_spin_trylock_irq); + +int __lockfunc _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + local_save_flags(*flags); + return __spin_trylock(lock); +} +EXPORT_SYMBOL(_spin_trylock_irqsave); + +static inline int __spin_is_locked(spinlock_t *lock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return _raw_spin_is_locked(&lock->lock.debug_slock); + else +#endif + return rt_mutex_is_locked(&lock->lock); +} + +int _spin_is_locked(spinlock_t *lock) +{ + return __spin_is_locked(lock); +} +EXPORT_SYMBOL(_spin_is_locked); + +int _spin_can_lock(spinlock_t *lock) +{ + return !__spin_is_locked(lock); +} +EXPORT_SYMBOL(_spin_can_lock); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + __spin_lock(lock, CALLER_ADDR0); + if (atomic_dec_and_test(atomic)) + return 1; + __spin_unlock(lock); + + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line) +{ + __init_rt_mutex(&lock->lock, 1, name, file, line); +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + _raw_spin_lock_init(&lock->lock.debug_slock); +#endif +} +EXPORT_SYMBOL(_spin_lock_init); + + +/* + * RW-lock wrappers: + */ +int __lockfunc _read_trylock(rwlock_t *rwlock) +{ +#ifdef 
CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return _raw_read_trylock(&rwlock->lock.lock.debug_rwlock); + else +#endif + return down_read_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_read_trylock); + +int __lockfunc _write_trylock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return _raw_write_trylock(&rwlock->lock.lock.debug_rwlock); + else +#endif + return down_write_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_write_trylock); + +inline void __lockfunc _write_lock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_write_lock(&rwlock->lock.lock.debug_rwlock); + else +#endif + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock); + +inline void __lockfunc _read_lock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_read_lock(&rwlock->lock.lock.debug_rwlock); + else +#endif + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock); + +inline void __lockfunc _write_unlock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_write_unlock(&rwlock->lock.lock.debug_rwlock); + else +#endif + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock); + +static inline void __read_unlock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + _raw_read_unlock(&rwlock->lock.lock.debug_rwlock); + else +#endif + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} + +void __lockfunc _read_unlock(rwlock_t *rwlock) +{ + __read_unlock(rwlock); +} +EXPORT_SYMBOL(_read_unlock); + +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + _write_lock(rwlock); + + local_save_flags(flags); + return flags; +} +EXPORT_SYMBOL(_write_lock_irqsave); + +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + _read_lock(rwlock); + + local_save_flags(flags); + return flags; +} 
+EXPORT_SYMBOL(_read_lock_irqsave); + +void __lockfunc _write_lock_irq(rwlock_t *rwlock) +{ + _write_lock(rwlock); +} +EXPORT_SYMBOL(_write_lock_irq); + +void __lockfunc _read_lock_irq(rwlock_t *rwlock) +{ + _read_lock(rwlock); +} +EXPORT_SYMBOL(_read_lock_irq); + +void __lockfunc _write_lock_bh(rwlock_t *rwlock) +{ + _write_lock(rwlock); +} +EXPORT_SYMBOL(_write_lock_bh); + +void __lockfunc _read_lock_bh(rwlock_t *rwlock) +{ + _read_lock(rwlock); +} +EXPORT_SYMBOL(_read_lock_bh); + +void __lockfunc _write_unlock_irq(rwlock_t *rwlock) +{ + _write_unlock(rwlock); +} +EXPORT_SYMBOL(_write_unlock_irq); + +void __lockfunc _read_unlock_irq(rwlock_t *rwlock) +{ + _read_unlock(rwlock); +} +EXPORT_SYMBOL(_read_unlock_irq); + +void __lockfunc _write_unlock_bh(rwlock_t *rwlock) +{ + _write_unlock(rwlock); +} +EXPORT_SYMBOL(_write_unlock_bh); + +void __lockfunc _read_unlock_bh(rwlock_t *rwlock) +{ + _read_unlock(rwlock); +} +EXPORT_SYMBOL(_read_unlock_bh); + +void __lockfunc _write_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags) +{ + _write_unlock(rwlock); +} +EXPORT_SYMBOL(_write_unlock_irqrestore); + +void __lockfunc _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags) +{ + _read_unlock(rwlock); +} +EXPORT_SYMBOL(_read_unlock_irqrestore); + +void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line) +{ + __init_rwsem(&rwlock->lock, 1, name, file, line); +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + _raw_rwlock_init(&rwlock->lock.lock.debug_rwlock); +#endif +} +EXPORT_SYMBOL(_rwlock_init); + +/* + * _read_can_lock() and _write_can_lock() does the same + */ +int _read_can_lock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return _raw_read_can_lock(&rwlock->lock.lock.debug_rwlock); + else +#endif + return !rt_rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_read_can_lock); + +int _write_can_lock(rwlock_t *rwlock) +{ +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + if (!preempt_locks) + return 
_raw_write_can_lock(&rwlock->lock.lock.debug_rwlock); + else +#endif + return !rt_rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_write_can_lock); + +/* + * Soft irq-flag support: + */ + +#ifdef CONFIG_DEBUG_PREEMPT +static void check_soft_flags(unsigned long flags) +{ + if (flags & ~IRQSOFF_MASK) { + static int print_once = 1; + if (print_once) { + print_once = 0; + printk("BUG: bad soft irq-flag value %08lx, %s/%d!\n", + flags, current->comm, current->pid); + dump_stack(); + } + } +} +#else +static inline void check_soft_flags(unsigned long flags) +{ +} +#endif + +void local_irq_enable_noresched(void) +{ + unmask_preempt_count(IRQSOFF_MASK); +} +EXPORT_SYMBOL(local_irq_enable_noresched); + +void local_irq_enable(void) +{ + unmask_preempt_count(IRQSOFF_MASK); + preempt_check_resched(); +} +EXPORT_SYMBOL(local_irq_enable); + +void local_irq_disable(void) +{ + mask_preempt_count(IRQSOFF_MASK); +} +EXPORT_SYMBOL(local_irq_disable); + +int irqs_disabled_flags(unsigned long flags) +{ + check_soft_flags(flags); + + return (flags & IRQSOFF_MASK); +} +EXPORT_SYMBOL(irqs_disabled_flags); + +void __local_save_flags(unsigned long *flags) +{ + *flags = irqs_off(); +} +EXPORT_SYMBOL(__local_save_flags); + +void __local_irq_save(unsigned long *flags) +{ + *flags = irqs_off(); + mask_preempt_count(IRQSOFF_MASK); +} +EXPORT_SYMBOL(__local_irq_save); + +void local_irq_restore(unsigned long flags) +{ + check_soft_flags(flags); + if (flags) + mask_preempt_count(IRQSOFF_MASK); + else { + unmask_preempt_count(IRQSOFF_MASK); + preempt_check_resched(); + } +} +EXPORT_SYMBOL(local_irq_restore); + +int irqs_disabled(void) +{ + return irqs_off(); +} +EXPORT_SYMBOL(irqs_disabled); Index: 2.6-8xx/kernel/time.c =================================================================== --- 2.6-8xx.orig/kernel/time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -97,8 +97,31 @@ #endif /* __ARCH_WANT_SYS_TIME */ +int timeofday_API_hacks(void 
*tv, void *tz) +{ +#ifdef CONFIG_LATENCY_TRACE + if (!tv && ((long)tz == 1)) + return user_trace_start(); + if (!tv && !tz) + return user_trace_stop(); +#endif + if (((long)tv == 1) && ((long)tz == 1)) { + current->flags |= PF_NOSCHED; + return 0; + } + if (((long)tv == 1) && ((long)tz == 0)) { + current->flags &= ~PF_NOSCHED; + return 0; + } + return 1; +} + asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) { + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); @@ -184,6 +207,10 @@ struct timespec new_ts; struct timezone new_tz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; Index: 2.6-8xx/kernel/exit.c =================================================================== --- 2.6-8xx.orig/kernel/exit.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/exit.c 2005-06-16 13:45:08.000000000 -0300 @@ -49,8 +49,11 @@ if (thread_group_leader(p)) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); - if (p->pid) + if (p->pid) { + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); + } } REMOVE_LINKS(p); @@ -374,8 +377,10 @@ while (set) { if (set & 1) { struct file * file = xchg(&files->fd[i], NULL); - if (file) + if (file) { filp_close(file, files); + cond_resched(); + } } i++; set >>= 1; @@ -505,9 +510,11 @@ if (mm != tsk->active_mm) BUG(); /* more a memory barrier than a real lock */ task_lock(tsk); + preempt_disable(); // FIXME tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + preempt_enable(); task_unlock(tsk); mmput(mm); } @@ -766,10 +773,6 @@ /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. 
*/ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -838,12 +841,18 @@ mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif - - BUG_ON(!(current->flags & PF_DEAD)); - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) ; + check_no_held_locks(tsk); + /* PF_DEAD causes final put_task_struct after we schedule. */ +again: + raw_local_irq_disable(); + tsk->flags |= PF_DEAD; + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08lx, count: %d, state: %08lx\n", + current->flags, atomic_read(&current->usage), current->state); + printk(KERN_ERR ".... trying again ...\n"); + goto again; } EXPORT_SYMBOL_GPL(do_exit); @@ -1343,6 +1352,7 @@ list_for_each(_p,&tsk->children) { p = list_entry(_p,struct task_struct,sibling); + BUG_ON(!atomic_read(&p->usage)); ret = eligible_child(pid, options, p); if (!ret) continue; Index: 2.6-8xx/kernel/printk.c =================================================================== --- 2.6-8xx.orig/kernel/printk.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/printk.c 2005-06-16 13:45:08.000000000 -0300 @@ -83,7 +83,7 @@ * It is also used in interesting ways to provide interlocking in * release_console_sem().
*/ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -363,10 +363,12 @@ { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write) con->write(con, &LOG_BUF(start), end - start); } + touch_critical_timing(); } /* @@ -470,6 +472,7 @@ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ init_MUTEX(&console_sem); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -734,12 +737,21 @@ con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); call_console_drivers(_con_start, _log_end); - local_irq_restore(flags); + raw_local_irq_restore(flags); } console_locked = 0; console_may_schedule = 0; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. 
+ */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled() && !raw_irqs_disabled()) +#endif if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); } @@ -970,7 +982,7 @@ */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); + static DEFINE_RAW_SPINLOCK(ratelimit_lock); static unsigned long toks = 10*5*HZ; static unsigned long last_msg; static int missed; Index: 2.6-8xx/kernel/posix-timers.c =================================================================== --- 2.6-8xx.orig/kernel/posix-timers.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/posix-timers.c 2005-06-16 13:45:08.000000000 -0300 @@ -94,7 +94,7 @@ */ #define TIMER_INACTIVE 1 -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) # define timer_active(tmr) \ ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) # define set_timer_inactive(tmr) \ @@ -102,10 +102,28 @@ (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ } while (0) #else -# define timer_active(tmr) BARFY // error to use outside of SMP +# define timer_active(tmr) BARFY /* error to use outside of SMP | RT */ # define set_timer_inactive(tmr) do { } while (0) #endif /* + * For RT the timer call backs are preemptable. This means that folks + * trying to delete timers may run into timers that are "active" for + * long times. To help out with this we provide a wake up function to + * wake up a caller who wants waking when a timer clears the call back. + * This is the same sort of thing that the del_timer_sync does, but we + * need (in the HRT case) to cover two lists and not just the one. 
+ */ +#ifdef CONFIG_PREEMPT_SOFTIRQS +#include +static DECLARE_WAIT_QUEUE_HEAD(timer_wake_queue); +#define wake_timer_waiters() wake_up(&timer_wake_queue) +#define wait_for_timer(timer) wait_event(timer_wake_queue, !timer_active(timer)) + +#else +#define wake_timer_waiters() +#define wait_for_timer(timer) +#endif +/* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. */ @@ -170,7 +188,7 @@ * even if we should want to have several clocks with differing resolutions. */ static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), - .lock = SPIN_LOCK_UNLOCKED}; + .lock = SPIN_LOCK_UNLOCKED(abs_list.lock)}; static void posix_timer_fn(unsigned long); static u64 do_posix_clock_monotonic_gettime_parts( @@ -527,6 +545,7 @@ schedule_next_timer(timr); } unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ + wake_timer_waiters(); } @@ -983,18 +1002,20 @@ * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ -#ifdef CONFIG_SMP - if (timer_active(timr) && !del_timer(&timr->it.real.timer)) - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ - return TIMER_RETRY; - +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + if (timer_active(timr) && !del_timer(&timr->it.real.timer)) { + /* + * It can only be active if on an other cpu (unless RT). + * Since we have cleared the interval stuff above, it + * should clear once we release the spin lock. Of + * course once we do that anything could happen, + * including the complete melt down of the timer. So + * return with a "retry" exit status. 
If RT we do a + * formal wait as the function code is fully + * preemptable... + */ + return TIMER_RETRY; + } set_timer_inactive(timr); #else del_timer(&timr->it.real.timer); @@ -1069,7 +1090,8 @@ unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + wait_for_timer(timr); + rtn = NULL; /* We already got the old time... */ goto retry; } @@ -1083,17 +1105,19 @@ static inline int common_timer_del(struct k_itimer *timer) { timer->it.real.incr = 0; -#ifdef CONFIG_SMP - if (timer_active(timer) && !del_timer(&timer->it.real.timer)) - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ - return TIMER_RETRY; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + if (timer_active(timer) && !del_timer(&timer->it.real.timer)) { + /* + * It can only be active if on an other cpu (unless RT). + * Since we have cleared the interval stuff above, it + * should clear once we release the spin lock. Of + * course once we do that anything could happen, + * including the complete melt down of the timer. So + * return with a "retry" exit status. For RT we do a + * formal wait as it could take a while. 
+ */ + return TIMER_RETRY; + } #else del_timer(&timer->it.real.timer); #endif @@ -1114,7 +1138,7 @@ struct k_itimer *timer; long flags; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) int error; retry_delete: #endif @@ -1122,7 +1146,7 @@ if (!timer) return -EINVAL; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) error = timer_delete_hook(timer); if (error == TIMER_RETRY) { @@ -1155,17 +1179,18 @@ { unsigned long flags; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) int error; retry_delete: #endif spin_lock_irqsave(&timer->it_lock, flags); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) error = timer_delete_hook(timer); if (error == TIMER_RETRY) { unlock_timer(timer, flags); + wait_for_timer(timer); goto retry_delete; } #else @@ -1424,6 +1449,14 @@ list_del_init(&timr->it.real.abs_timer_entry); if (add_clockset_delta(timr, &new_wall_to) && del_timer(&timr->it.real.timer)) /* timer run yet? */ + /* + * Note that we only do this if the timer is/was + * in the list. If it happens to be active an + * not in the timer list, it must be in the call + * back function, we leave it to that code to do + * the right thing. 
I.e we do NOT need + * del_timer_sync() + */ add_timer(&timr->it.real.timer); list_add(&timr->it.real.abs_timer_entry, &abs_list.list); spin_unlock_irq(&abs_list.lock); Index: 2.6-8xx/kernel/rcupdate.c =================================================================== --- 2.6-8xx.orig/kernel/rcupdate.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/rcupdate.c 2005-06-16 13:45:08.000000000 -0300 @@ -61,9 +61,9 @@ }; static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; + {.lock = SPIN_LOCK_UNLOCKED(rcu_state.lock), .cpumask = CPU_MASK_NONE }; static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; + {.lock = SPIN_LOCK_UNLOCKED(rcu_bh_state.lock), .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; @@ -480,3 +480,39 @@ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ + +#ifdef CONFIG_PREEMPT_RCU + +void rcu_read_lock(void) +{ + if (current->rcu_read_lock_nesting++ == 0) { + current->rcu_data = &get_cpu_var(rcu_data); + atomic_inc(&current->rcu_data->active_readers); + smp_mb__after_atomic_inc(); + put_cpu_var(rcu_data); + } +} +EXPORT_SYMBOL(rcu_read_lock); + +void rcu_read_unlock(void) +{ + int cpu; + + if (--current->rcu_read_lock_nesting == 0) { + atomic_dec(&current->rcu_data->active_readers); + smp_mb__after_atomic_dec(); + /* + * Check whether we have reached quiescent state. + * Note! This is only for the local CPU, not for + * current->rcu_data's CPU [which typically is the + * current CPU, but may also be another CPU]. 
+ */ + cpu = get_cpu(); + rcu_qsctr_inc(cpu); + put_cpu(); + } +} +EXPORT_SYMBOL(rcu_read_unlock); + +#endif + Index: 2.6-8xx/kernel/sched.c =================================================================== --- 2.6-8xx.orig/kernel/sched.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/sched.c 2005-06-16 13:45:08.000000000 -0300 @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar */ #include @@ -46,6 +48,7 @@ #include #include #include +#include #include #include @@ -185,6 +188,7 @@ typedef struct runqueue runqueue_t; struct prio_array { + runqueue_t *rq; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -198,7 +202,7 @@ * acquire operations must be ordered by ascending &runqueue. */ struct runqueue { - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -206,6 +210,9 @@ */ unsigned long nr_running; #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + unsigned long rt_nr_running; +#endif unsigned long cpu_load; #endif unsigned long long nr_switches; @@ -269,11 +276,23 @@ #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. 
+ */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + +/* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define _finish_arch_switch(rq, next) spin_unlock(&(rq)->lock) # define task_running(rq, p) ((rq)->curr == (p)) #endif @@ -288,7 +307,7 @@ struct runqueue *rq; repeat_lock_task: - local_irq_save(*flags); + raw_local_irq_save(*flags); rq = task_rq(p); spin_lock(&rq->lock); if (unlikely(rq != task_rq(p))) { @@ -407,7 +426,7 @@ { runqueue_t *rq; - local_irq_disable(); + raw_local_irq_disable(); rq = this_rq(); spin_lock(&rq->lock); @@ -536,6 +555,33 @@ #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ +int rt_overload_schedule, rt_overload_wakeup, rt_overload_pulled; + +__cacheline_aligned_in_smp atomic_t rt_overload; + +static inline void inc_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + rq->rt_nr_running++; + if (rq->rt_nr_running == 2) + atomic_inc(&rt_overload); + } +#endif +} + +static inline void dec_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + WARN_ON(!rq->rt_nr_running); + rq->rt_nr_running--; + if (rq->rt_nr_running == 1) + atomic_dec(&rt_overload); + } +#endif +} + /* * Adding/removing a task to/from a priority array: */ @@ -545,15 +591,21 @@ list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_rt_tasks(p, array->rq); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { + if (p->flags & PF_DEAD) { + printk("BUG: %s/%d: dead task enqueued!\n", p->comm, p->pid); + dump_stack(); + } sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, 
array->bitmap); array->nr_active++; p->array = array; + inc_rt_tasks(p, array->rq); } /* @@ -587,13 +639,11 @@ * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __effective_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -604,22 +654,52 @@ return prio; } +static int effective_prio(task_t *p) +{ + if (rt_task(p)) + return p->prio; + return __effective_prio(p); +} + +static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr)) + __trace_start_sched_wakeup(p); +} + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { + trace_special_pid(p->pid, p->prio, rq->nr_running); enqueue_task(p, rq->active); rq->nr_running++; } /* + * __activate_task_after - move a task to the runqueue, + * to execute after a specific task. + */ +static inline +void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq) +{ + // FIXME: to head rather? + list_add_tail(&p->run_list, &parent->run_list); + p->array = parent->array; + p->array->nr_active++; + rq->nr_running++; + inc_rt_tasks(p, rq); +} + +/* * __activate_idle_task - move idle task to the _front_ of runqueue. */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); rq->nr_running++; + WARN_ON(rt_task(p)); } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -681,7 +761,7 @@ } } - p->prio = effective_prio(p); + p->prio = p->normal_prio = effective_prio(p); } /* @@ -959,7 +1039,7 @@ * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t * p, unsigned int state, int sync, int mutex) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -971,6 +1051,13 @@ int new_cpu; #endif +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1071,6 +1158,16 @@ this_cpu = smp_processor_id(); cpu = task_cpu(p); + } else { + /* + * If a newly woken up RT task cannot preempt the + * current (RT) task then try to find another + * CPU it can preempt: + */ + if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) { + smp_send_reschedule_allbutself(); + rt_overload_wakeup++; + } } out_activate: @@ -1087,37 +1184,76 @@ /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) + * trigger a 'delayed preemption', if the woken up task will run on + * this cpu. Delayed preemption is guaranteed to happen upon + * return to userspace. 
*/ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) + activate_task(p, rq, cpu == this_cpu); + if (TASK_PREEMPTS_CURR(p, rq)) { resched_task(rq->curr); + trace_start_sched_wakeup(p, rq); + } + } else { + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + set_tsk_need_resched_delayed(rq->curr); } + if (rq->curr && p && rq && _need_resched()) + trace_special_pid(p->pid, p->prio, rq->curr->prio); success = 1; out_running: - p->state = TASK_RUNNING; + if (mutex) + p->state = TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; out: - task_rq_unlock(rq, &flags); +#ifdef PREEMPT_DIRECT + spin_unlock(&rq->lock); + /* + * Common place where preemption is requested - if we can + * reschedule then do it here without enabling interrupts + * again (and lengthening latency): + */ + if (_need_resched() && !irqs_disabled_flags(flags) && !preempt_count()) + preempt_schedule_irq(); + raw_local_irq_restore(flags); +#else + spin_unlock_irqrestore(&rq->lock, flags); +#endif + /* no need to check for preempt here - we just handled it */ return success; } int fastcall wake_up_process(task_t * p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 0); + mcount(); + return ret; } EXPORT_SYMBOL(wake_up_process); +int fastcall wake_up_process_mutex(task_t * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 1); + mcount(); + return ret; +} + +EXPORT_SYMBOL(wake_up_process_mutex); + int fastcall wake_up_state(task_t *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); + mcount(); + return ret; } #ifdef CONFIG_SMP @@ -1158,7 +1294,7 @@ * total amount of pending timeslices in the system 
doesn't change, * resulting in more scheduling fairness. */ - local_irq_disable(); + raw_local_irq_disable(); p->time_slice = (current->time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by @@ -1176,10 +1312,10 @@ current->time_slice = 1; preempt_disable(); scheduler_tick(); - local_irq_enable(); + raw_local_irq_enable(); preempt_enable(); } else - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -1210,7 +1346,7 @@ p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->prio = effective_prio(p); + p->prio = p->normal_prio = effective_prio(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -1222,16 +1358,17 @@ if (unlikely(!current->array)) __activate_task(p, rq); else { - p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; + p->prio = p->normal_prio = current->prio; + __activate_task_after(p, current, rq); } set_need_resched(); - } else + trace_start_sched_wakeup(p, rq); + } else { /* Run child last */ __activate_task(p, rq); + if (rt_task(p) && TASK_PREEMPTS_CURR(p, rq)) + set_need_resched(); + } /* * We skip the following code due to cpu == this_cpu * @@ -1309,13 +1446,14 @@ * details.) */ static inline void finish_task_switch(task_t *prev) - __releases(rq->lock) + __releases(this_rq->lock) { - runqueue_t *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; + int this_cpu = smp_processor_id(); + runqueue_t *this_rq = cpu_rq(this_cpu); + struct mm_struct *mm = this_rq->prev_mm; unsigned long prev_task_flags; - rq->prev_mm = NULL; + this_rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". 
@@ -1329,11 +1467,28 @@ * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + /* + * If we pushed an RT task off the runqueue, + * then kick other CPUs, they might run it: + */ + if (unlikely(rt_task(current) && prev->array && rt_task(prev))) { + rt_overload_schedule++; + smp_send_reschedule_allbutself(); + } +#endif + _finish_arch_switch(this_rq, prev); + + trace_stop_sched_switched(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_task_flags & PF_DEAD)) - put_task_struct(prev); + put_task_struct_delayed(prev); } /** @@ -1343,7 +1498,11 @@ asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { + preempt_disable(); // TODO: move this to fork setup finish_task_switch(prev); + __preempt_enable_no_resched(); + raw_local_irq_enable(); + preempt_check_resched(); if (current->set_child_tid) put_user(current->pid, current->set_child_tid); @@ -1372,6 +1531,8 @@ rq->prev_mm = oldmm; } + trace_cmdline(); + /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -1412,6 +1573,21 @@ return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + return cpu_rq(cpu)->rt_nr_running; +#else + return 0; +#endif +} + + unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; @@ -1493,6 +1669,100 @@ } } +#ifdef CONFIG_PREEMPT_RT + +static task_t * pick_rt_task(runqueue_t *src_rq, int this_cpu) +{ + struct list_head *head, *curr; + prio_array_t *array; + task_t *tmp; + int idx; + + WARN_ON(!spin_is_locked(&src_rq->lock)); + /* + * Only consider the active array - we are looking for + * RT tasks. 
Must have 2 tasks at least: + */ + array = src_rq->active; + if (unlikely(array->nr_active < 2)) + return NULL; + + idx = sched_find_first_bit(array->bitmap); +next_in_bitmap: + /* + * Only non-RT tasks available - abort the search: + */ + if (idx >= MAX_RT_PRIO) + return NULL; + + head = array->queue + idx; + curr = head->next; +next_in_queue: + tmp = list_entry(curr, task_t, run_list); + /* + * Return the highest-prio non-running RT task (if task + * may run on this CPU): + */ + if (!task_running(src_rq, tmp) && + cpu_isset(this_cpu, tmp->cpus_allowed)) + return tmp; + + curr = curr->next; + if (curr != head) + goto next_in_queue; + + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + goto next_in_bitmap; +} + +/* + * Pull RT tasks from other CPUs in the RT-overload + * case. Interrupts are disabled, local rq is locked. + */ +static void pull_rt_tasks(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *src_rq; + task_t *p; + int cpu; + + WARN_ON(!raw_irqs_disabled()); + + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + src_rq = cpu_rq(cpu); + if (src_rq->rt_nr_running <= 1) + continue; + + double_lock_balance(this_rq, src_rq); + + p = pick_rt_task(src_rq, this_cpu); + + if (p /* && TASK_PREEMPTS_CURR(p, this_rq) */ ) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->array); + rt_overload_pulled++; + + set_task_cpu(p, this_cpu); + + p->timestamp = p->timestamp - + src_rq->timestamp_last_tick + + this_rq->timestamp_last_tick; + deactivate_task(p, src_rq); + activate_task(p, this_rq, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. + */ + } + spin_unlock(&src_rq->lock); + } +} + +#endif + + /* * find_idlest_cpu - find the least busy runqueue. 
*/ @@ -2246,10 +2516,11 @@ { unsigned long long ns; unsigned long flags; - local_irq_save(flags); + + raw_local_irq_save(flags); ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); ns = tsk->sched_time + (sched_clock() - ns); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ns; } @@ -2358,6 +2629,8 @@ task_t *p = current; unsigned long long now = sched_clock(); + BUG_ON(!raw_irqs_disabled()); + update_cpu_clock(p, rq, now); rq->timestamp_last_tick = now; @@ -2381,6 +2654,8 @@ * priority until it either goes to sleep or uses up its * timeslice. This makes it possible for interactive tasks * to use up their timeslices at their highest priority levels. + * + * Priority-boosted SCHED_NORMAL tasks may go here too. */ if (rt_task(p)) { /* @@ -2400,7 +2675,7 @@ if (!--p->time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = effective_prio(p); + p->prio = p->normal_prio = effective_prio(p); p->time_slice = task_timeslice(p); p->first_time_slice = 0; @@ -2569,42 +2844,51 @@ } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_LATENCY_TRACE) && defined(CONFIG_RT_DEADLOCK_DETECT) -void fastcall add_preempt_count(int val) +static void trace_array(prio_array_t *array) { - /* - * Underflow? - */ - BUG_ON(((int)preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + int i; + task_t *p; + struct list_head *head, *tmp; + + for (i = 0; i < MAX_PRIO; i++) { + head = array->queue + i; + if (list_empty(head)) { + WARN_ON(test_bit(i, array->bitmap)); + continue; + } + WARN_ON(!test_bit(i, array->bitmap)); + list_for_each(tmp, head) { + p = list_entry(tmp, task_t, run_list); + trace_special_pid(p->pid, p->prio, + p->policy == SCHED_NORMAL ? 
+ p->static_prio : + (MAX_RT_PRIO-1) - p->rt_priority); + } + } } -EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +static inline void trace_all_runnable_tasks(runqueue_t *rq) +{ + if (trace_enabled) { + trace_array(rq->active); + trace_array(rq->expired); + } +} + +#else + +static inline void trace_all_runnable_tasks(runqueue_t *rq) { - /* - * Underflow? - */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; } -EXPORT_SYMBOL(sub_preempt_count); #endif /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +void __sched __schedule(void) { long *switch_count; task_t *prev, *next; @@ -2615,26 +2899,24 @@ unsigned long run_time; int cpu, idx; + WARN_ON(system_state == SYSTEM_BOOTING); /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. - */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + * Test if we are atomic. + */ + if (unlikely(in_atomic())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -need_resched: - preempt_disable(); + preempt_disable(); // FIXME: disable irqs here prev = current; release_kernel_lock(prev); -need_resched_nonpreemptible: rq = this_rq(); /* @@ -2642,7 +2924,7 @@ * Remove this check after it has been exercised a bit. 
*/ if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + printk(KERN_ERR "BUG: scheduling from the idle thread!\n"); dump_stack(); } @@ -2661,13 +2943,12 @@ */ run_time /= (CURRENT_BONUS(prev) ? : 1); + cpu = smp_processor_id(); spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; // TODO: temporary - to see it in vmstat + if ((prev->state & ~TASK_RUNNING_MUTEX) && + !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) @@ -2678,8 +2959,23 @@ deactivate_task(prev, rq); } } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + if (unlikely(prev->flags & PF_DEAD)) { + if (prev->state != TASK_RUNNING) { + printk("prev->state: %ld != TASK_RUNNING??\n", + prev->state); + WARN_ON(1); + } else + deactivate_task(prev, rq); + prev->state = EXIT_DEAD; + } + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (unlikely(atomic_read(&rt_overload))) + pull_rt_tasks(rq, cpu); +#endif - cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { go_idle: idle_balance(cpu, rq); @@ -2745,6 +3041,7 @@ schedstat_inc(rq, sched_goidle); prefetch(next); clear_tsk_need_resched(prev); + clear_tsk_need_resched_delayed(prev); rcu_qsctr_inc(task_cpu(prev)); update_cpu_clock(prev, rq, now); @@ -2754,6 +3051,8 @@ prev->sleep_avg = 0; prev->timestamp = prev->last_ran = now; + trace_all_runnable_tasks(rq); + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -2764,22 +3063,98 @@ prepare_arch_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - + if (prev && current) + trace_special_pid(prev->pid, prev->prio, current->prio); finish_task_switch(prev); - } else - 
spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + trace_stop_sched_switched(next); + __preempt_enable_no_resched(); + spin_unlock(&rq->lock); + } - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + reacquire_kernel_lock(current); +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. + */ + if (unlikely(irqs_disabled() || raw_irqs_disabled())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); + } + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED))); + raw_local_irq_enable(); // TODO: do sti; ret } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT + +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk("turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk("turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + + +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + static int once = 1; + + barrier(); + dec_preempt_count(); + + if (once 
&& !preempt_count()) { + once = 0; + printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); + dump_stack(); + } +} + +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + /* * this is is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -2792,14 +3167,17 @@ struct task_struct *task = current; int saved_lock_depth; #endif + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (unlikely(ti->preempt_count || irqs_disabled())) + if (unlikely(ti->preempt_count || irqs_disabled() || raw_irqs_disabled())) return; need_resched: + raw_local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* * We keep the big kernel semaphore locked, but we @@ -2810,25 +3188,24 @@ saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - schedule(); + __schedule(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; + raw_local_irq_enable(); } EXPORT_SYMBOL(preempt_schedule); /* - * this is is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. 
*/ asmlinkage void __sched preempt_schedule_irq(void) { @@ -2837,10 +3214,17 @@ struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed*/ - BUG_ON(ti->preempt_count || !irqs_disabled()); + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. + * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; need_resched: + raw_local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* * We keep the big kernel semaphore locked, but we @@ -2851,17 +3235,19 @@ saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); - local_irq_disable(); + __schedule(); + + raw_local_irq_disable(); +#ifdef CONFIG_PREEMPT_RT + local_irq_enable_noresched(); +#endif + #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; } @@ -2870,7 +3256,7 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { task_t *p = curr->task; - return try_to_wake_up(p, mode, sync); + return try_to_wake_up(p, mode | TASK_RUNNING_MUTEX, sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -2982,6 +3368,13 @@ } EXPORT_SYMBOL(complete_all); +unsigned int fastcall completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + + void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); @@ -3237,6 +3630,65 @@ capable(CAP_SYS_NICE)); } +int mutex_getprio(task_t *p) +{ + int prio; + + if (p->policy != SCHED_NORMAL) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __effective_prio(p); + trace_special_pid(p->pid, p->prio, prio); + return prio; +} + +/* + * Used by the 
PREEMPT_RT code to implement + * priority inheritance logic: + */ +void mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio, prev_resched; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + trace_special_pid(p->pid, oldprio, prio); + prev_resched = _need_resched(); + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + trace_special(prev_resched, _need_resched(), 0); + + task_rq_unlock(rq, &flags); +} + #ifdef __ARCH_WANT_SYS_NICE /* @@ -3347,9 +3799,9 @@ p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->prio = p->normal_prio = MAX_RT_PRIO-1 - p->rt_priority; else - p->prio = p->static_prio; + p->prio = p->normal_prio = p->static_prio; } /** @@ -3379,7 +3831,8 @@ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 
*/ if (param->sched_priority < 0 || - param->sched_priority > MAX_USER_RT_PRIO-1) + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; @@ -3717,21 +4170,28 @@ * no need to preempt or enable interrupts: */ __release(rq->lock); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + __raw_spin_unlock(&rq->lock); + __preempt_enable_no_resched(); - schedule(); + __schedule(); + raw_local_irq_enable(); + preempt_check_resched(); return 0; } -static inline void __cond_resched(void) +static void __cond_resched(void) { + if (system_state == SYSTEM_BOOTING || !current->pid) + return; + if (preempt_count() & PREEMPT_ACTIVE) + return; do { + raw_local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + raw_local_irq_enable(); } int __sched cond_resched(void) @@ -3753,40 +4213,125 @@ * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
*/ -int cond_resched_lock(spinlock_t * lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); spin_lock(lock); + ret = 1; } if (need_resched()) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); + __raw_spin_unlock(lock); + __preempt_enable_no_resched(); __cond_resched(); spin_lock(lock); + ret = 1; + } + return ret; +} + +EXPORT_SYMBOL(__cond_resched_raw_spinlock); + +#ifdef CONFIG_PREEMPT_RT + +int __cond_resched_spinlock(spinlock_t *lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + _spin_unlock(lock); + __cond_resched(); + _spin_lock(lock); return 1; } +#endif return 0; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_spinlock); + +#endif + +/* + * Preempt a softirq context if necessary: + */ int __sched cond_resched_softirq(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(!in_softirq()); - if (need_resched()) { + if (softirq_need_resched()) { __local_bh_enable(); __cond_resched(); local_bh_disable(); return 1; } +#endif return 0; } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Preempt a hardirq context if necessary: + */ +int cond_resched_hardirq(void) +{ + unsigned long flags; + + BUG_ON(!in_irq()); + if (hardirq_need_resched()) { + local_save_flags(flags); + irq_exit(); + __cond_resched(); + local_irq_restore(flags); + irq_enter(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_hardirq); + +/* + * Preempt any context: + */ +int cond_resched_all(void) +{ + if (hardirq_count()) + return cond_resched_hardirq(); + if (softirq_count()) + return cond_resched_softirq(); + return cond_resched(); +} + +EXPORT_SYMBOL(cond_resched_all); + +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, 
&voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif /** * yield - yield the current processor to other threads. @@ -3794,12 +4339,31 @@ * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurance after bootup ... this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} + EXPORT_SYMBOL(yield); /* @@ -3938,25 +4502,29 @@ task_t *relative; unsigned state; unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + static const char *stat_nam[] = { "R", "M", "S", "D", "T", "t", "Z", "X" }; - printk("%-13.13s ", p->comm); + printk("%-13.13s [%p]", p->comm, p); state = p->state ? 
__ffs(p->state) + 1 : 0; if (state < ARRAY_SIZE(stat_nam)) printk(stat_nam[state]); else printk("?"); #if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running "); else printk(" %08lX ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->array) + printk("[on rq] "); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long * n = (unsigned long *) (p->thread_info+1); @@ -3983,13 +4551,14 @@ else printk(" (NOTLB)\n"); - if (state != TASK_RUNNING) +// if (state != TASK_RUNNING) show_stack(p, NULL); } void show_state(void) { task_t *g, *p; + int do_unlock = 1; #if (BITS_PER_LONG == 32) printk("\n" @@ -4000,7 +4569,16 @@ " sibling\n"); printk(" task PC pid father child younger older\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -4010,7 +4588,9 @@ show_task(p); } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); + show_all_locks(); } void __devinit init_idle(task_t *idle, int cpu) @@ -4031,7 +4611,9 @@ spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; @@ -4115,12 +4697,13 @@ * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. 
*/ -static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) - return; + return 0; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4133,7 +4716,9 @@ if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + WARN_ON(p == rq_src->curr); set_task_cpu(p, dest_cpu); + if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4147,10 +4732,13 @@ activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); + ret = 1; } out: double_rq_unlock(rq_src, rq_dest); + + return ret; } /* @@ -4200,7 +4788,7 @@ if (req->type == REQ_MOVE_TASK) { spin_unlock(&rq->lock); __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); + raw_local_irq_enable(); } else if (req->type == REQ_SET_DOMAIN) { rq->sd = req->sd; spin_unlock_irq(&rq->lock); @@ -4271,12 +4859,12 @@ runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* Run through task list and migrate tasks from the dead cpu. */ @@ -4354,9 +4942,9 @@ * that's OK. No task can be added to this CPU, so iteration is * fine. 
*/ - spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, tsk); spin_lock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_unlock_irq(&rq->lock); put_task_struct(tsk); } @@ -4940,6 +5528,7 @@ for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->rq = rq; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); @@ -4955,6 +5544,9 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (c) Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -4964,7 +5556,7 @@ init_idle(current, smp_processor_id()); } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #if defined(in_atomic) @@ -4972,13 +5564,17 @@ if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + stop_trace(); + printk(KERN_ERR "BUG: sleeping function called from invalid" + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); dump_stack(); } #endif Index: 2.6-8xx/kernel/fork.c =================================================================== --- 2.6-8xx.orig/kernel/fork.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/fork.c 2005-06-16 13:45:08.000000000 -0300 @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include @@ -63,6 +65,16 @@ 
EXPORT_SYMBOL(tasklist_lock); +/* + * Delayed mmdrop/put_task_struct. In the PREEMPT_RT case we + * dont want to do this from the scheduling context. + */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_put_list); +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + + int nr_processes(void) { int cpu; @@ -107,6 +119,8 @@ void __put_task_struct(struct task_struct *tsk) { + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(!(tsk->flags & PF_DEAD)); WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -121,8 +135,29 @@ free_task(tsk); } +void put_task_struct(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + + if (!atomic_dec_and_test(&tsk->usage)) + return; + __put_task_struct(tsk); +} + +EXPORT_SYMBOL(put_task_struct); + +void get_task_struct(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + atomic_inc(&tsk->usage); +} + +EXPORT_SYMBOL(get_task_struct); + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -150,6 +185,11 @@ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) { + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); + INIT_LIST_HEAD(&per_cpu(delayed_put_list, i)); + } } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -321,6 +361,7 @@ rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = TASK_UNMAPPED_BASE; if (likely(!mm_alloc_pgd(mm))) { @@ -948,6 +989,11 @@ goto bad_fork_cleanup; } #endif + INIT_LIST_HEAD(&p->delayed_put); + preempt_disable(); + plist_init(&p->pi_waiters, MAX_PRIO); + 
preempt_enable(); + p->blocked_on = NULL; /* not blocked yet */ p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1021,8 +1067,10 @@ * another CPU - so we re-copy it here and set the child's CPU to * the parent's CPU. This avoids alot of nasty races. */ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* * Check for pending SIGKILL! The new thread should not be allowed @@ -1095,8 +1143,11 @@ if (thread_group_leader(p)) { attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); - if (p->pid) + if (p->pid) { + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); + } } nr_threads++; @@ -1272,3 +1323,173 @@ sizeof(struct mm_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } + +static int put_task_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_put_list); + while (!list_empty(head)) { + struct task_struct *task = list_entry(head->next, + struct task_struct, delayed_put); + list_del(&task->delayed_put); + put_cpu_var(delayed_put_list); + + __put_task_struct(task); + ret = 1; + + head = &get_cpu_var(delayed_put_list); + } + put_cpu_var(delayed_put_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __put_task_struct_delayed(struct task_struct *task) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_put_list); + list_add_tail(&task->delayed_put, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_put_list); +} + +void put_task_struct_delayed(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + + if (!atomic_dec_and_test(&tsk->usage)) + return; + __put_task_struct_delayed(tsk); +} + +static int mmdrop_complete(void) +{ + struct list_head *head; + int 
ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + printk("desched thread %ld started up.\n", (long) __bind_cpu); + + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + int ret; + + ret = put_task_complete(); + ret |= mmdrop_complete(); + if (ret) + continue; + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + printk("desched cpu_callback %ld/%p\n", action, hcpu); + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_put_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + 
wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + + printk("spawn_desched_task(%p)\n", cpu); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + Index: 2.6-8xx/kernel/irq/proc.c =================================================================== --- 2.6-8xx.orig/kernel/irq/proc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/proc.c 2005-06-16 13:45:08.000000000 -0300 @@ -7,9 +7,12 @@ */ #include +#include #include #include +#include "internals.h" + static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; #ifdef CONFIG_SMP @@ -67,37 +70,6 @@ #endif -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - - for (action = desc->action ; action; action = action->next) - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_dir[irq] || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_dir[irq]); -} - -#undef MAX_NAMELEN - #define 
MAX_NAMELEN 10 void register_irq_proc(unsigned int irq) @@ -137,10 +109,96 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) remove_proc_entry(action->dir->name, irq_dir[irq]); } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & SA_NODELAY ? '0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= SA_NODELAY; + if (c == '1') + action->flags &= ~SA_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return 1; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_dir[irq] || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_dir[irq]); + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if 
(!entry) + return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + + void init_irq_proc(void) { int i; @@ -150,6 +208,9 @@ if (!root_irq_dir) return; + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. */ Index: 2.6-8xx/kernel/irq/manage.c =================================================================== --- 2.6-8xx.orig/kernel/irq/manage.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/manage.c 2005-06-16 13:45:08.000000000 -0300 @@ -7,8 +7,10 @@ */ #include -#include #include +#include +#include +#include #include #include "internals.h" @@ -30,8 +32,12 @@ { struct irq_desc *desc = irq_desc + irq; - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); } EXPORT_SYMBOL(synchronize_irq); @@ -53,13 +59,15 @@ { irq_desc_t *desc = irq_desc + irq; unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); + + __raw_local_irq_save(flags); + spin_lock(&desc->lock); if (!desc->depth++) { desc->status |= IRQ_DISABLED; desc->handler->disable(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + spin_unlock(&desc->lock); + __raw_local_irq_restore(flags); } EXPORT_SYMBOL(disable_irq_nosync); @@ -102,7 +110,8 @@ irq_desc_t *desc = irq_desc + irq; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + __raw_local_irq_save(flags); + spin_lock(&desc->lock); switch (desc->depth) { case 0: WARN_ON(1); @@ -121,12 +130,28 @@ default: desc->depth--; } - spin_unlock_irqrestore(&desc->lock, flags); + spin_unlock(&desc->lock); + __raw_local_irq_restore(flags); } EXPORT_SYMBOL(enable_irq); /* + * If any action has SA_NODELAY then turn IRQ_NODELAY 
on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & SA_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. @@ -176,10 +201,14 @@ rand_initialize_irq(irq); } + if (!(new->flags & SA_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock,flags); + __raw_local_irq_save(flags); + spin_lock(&desc->lock); p = &desc->action; if ((old = *p) != NULL) { /* Can't share interrupts unless both agree to */ @@ -198,6 +227,11 @@ *p = new; + /* + * Propagate any possible SA_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + if (!shared) { desc->depth = 0; desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | @@ -207,11 +241,12 @@ else desc->handler->enable(irq); } - spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock(&desc->lock); + __raw_local_irq_restore(flags); new->irq = irq; register_irq_proc(irq); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -241,7 +276,8 @@ return; desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); + __raw_local_irq_save(flags); + spin_lock(&desc->lock); p = &desc->action; for (;;) { struct irqaction * action = *p; @@ -262,7 +298,9 @@ else desc->handler->disable(irq); } - spin_unlock_irqrestore(&desc->lock,flags); + recalculate_desc_flags(desc); + spin_unlock(&desc->lock); + __raw_local_irq_restore(flags); unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU */ @@ -271,7 +309,8 @@ return; } printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - 
spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock(&desc->lock); + __raw_local_irq_restore(flags); return; } } @@ -347,3 +386,174 @@ EXPORT_SYMBOL(request_irq); +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +/* + * Real-Time Preemption depends on hardirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + +#endif + +static void do_hardirq(struct irq_desc *desc) +{ + struct irqaction * action; + unsigned int irq = desc - irq_desc; + + raw_local_irq_disable(); + + if (desc->status & IRQ_INPROGRESS) { + action = desc->action; + spin_lock(&desc->lock); + for (;;) { + irqreturn_t action_ret = 0; + + if (action) { + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, NULL,action); + raw_local_irq_enable(); + cond_resched_all(); + raw_local_irq_disable(); + spin_lock(&desc->lock); + } + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. 
+ */ + desc->handler->end(irq); + if (!(desc->status & IRQ_DISABLED)) + desc->handler->enable(irq); + spin_unlock(&desc->lock); + } + raw_local_irq_enable(); + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +extern asmlinkage void __do_softirq(void); + +static int curr_irq_prio = 49; + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; +#ifdef CONFIG_SMP + int irq = desc - irq_desc; + cpumask_t mask; + + mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq])); + set_cpus_allowed(current, mask); +#endif + current->flags |= PF_NOFREEZE | PF_HARDIRQ; + + /* + * Scale irq thread priorities from prio 50 to prio 25 + */ + param.sched_priority = curr_irq_prio; + if (param.sched_priority > 25) + curr_irq_prio = param.sched_priority - 1; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + do_hardirq(desc); + cond_resched_all(); + __do_softirq(); + raw_local_irq_enable(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? 
+ */ + if (!cpu_isset(smp_processor_id(), irq_affinity[irq])) { + mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq])); + set_cpus_allowed(current, mask); + } +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + struct task_struct *thread; + + if (desc->thread || !ok_to_create_irq_threads) + return 0; + thread = kthread_create(do_irqd, desc, "IRQ %d", irq); + if (IS_ERR(thread)) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + desc->thread = thread; + /* + * kthread_create() fails with an ERR_PTR(), never NULL, so the + * thread is only published in desc->thread once it is valid. An + * interrupt may have come in before that store; wake the thread: + */ + smp_mb(); + wake_up_process(desc->thread); + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} + + Index: 2.6-8xx/kernel/irq/spurious.c =================================================================== --- 2.6-8xx.orig/kernel/irq/spurious.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/spurious.c 2005-06-16 13:45:08.000000000 -0300 @@ -10,6 +10,10 @@ #include #include #include +#ifdef CONFIG_X86_IO_APIC +# include +# include +#endif /* * If 99,900 of the previous 100,000 interrupts have not been handled @@ -73,12 +77,16 @@ * The interrupt is stuck */ __report_bad_irq(irq, desc, action_ret); +#ifdef CONFIG_X86_IO_APIC + sis_apic_bug = 1; +#else /* * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED; desc->handler->disable(irq);
+#endif } desc->irqs_unhandled = 0; } Index: 2.6-8xx/kernel/irq/handle.c =================================================================== --- 2.6-8xx.orig/kernel/irq/handle.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/handle.c 2005-06-16 13:45:08.000000000 -0300 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -32,10 +33,12 @@ [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .handler = &no_irq_type, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED } }; +EXPORT_SYMBOL_GPL(irq_desc); + /* * Generic 'no controller' code */ @@ -74,6 +77,32 @@ } /* + * Hack - used for development only. + */ +int debug_direct_keyboard = 0; + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + +/* * Have got an event to handle: */ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, @@ -81,35 +110,62 @@ { int ret, retval = 0, status = 0; - if (!(action->flags & SA_INTERRUPT)) - local_irq_enable(); + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & SA_INTERRUPT)) + raw_local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id, regs); + if (preempt_count() != preempt_count) { + stop_trace(); + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + dump_stack(); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & 
SA_SAMPLE_RANDOM) + if (status & SA_SAMPLE_RANDOM) { + raw_local_irq_enable(); add_interrupt_randomness(irq); - local_irq_disable(); + } + raw_local_irq_disable(); return retval; } +cycles_t irq_timestamp(unsigned int irq) +{ + return irq_desc[irq].timestamp; +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +fastcall notrace unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + /* + * Disable the soft-irq-flag: + */ + local_irq_save(flags); +#endif kstat_this_cpu.irqs[irq]++; if (desc->status & IRQ_PER_CPU) { irqreturn_t action_ret; @@ -122,6 +178,7 @@ desc->handler->end(irq); return 1; } + desc->timestamp = get_cycles(); spin_lock(&desc->lock); desc->handler->ack(irq); @@ -154,6 +211,14 @@ goto out; /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) { + desc->handler->disable(irq); + goto out_no_end; + } + + /* * Edge triggered interrupts need to remember * pending events. * This applies to any hw interrupts that allow a second @@ -178,15 +243,22 @@ desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; - out: /* * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. 
*/ desc->handler->end(irq); +out_no_end: spin_unlock(&desc->lock); - +#ifdef CONFIG_PREEMPT_RT + /* re-enable interrupts to break an IRQ latency path: */ + raw_local_irq_enable(); + /* restore the soft IRQ-flag: */ + local_irq_restore(flags); + /* re-disable interrupts because callers expect irqs off: */ + raw_local_irq_disable(); +#endif return 1; } Index: 2.6-8xx/kernel/irq/autoprobe.c =================================================================== --- 2.6-8xx.orig/kernel/irq/autoprobe.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/autoprobe.c 2005-06-16 13:45:08.000000000 -0300 @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -26,7 +27,7 @@ */ unsigned long probe_irq_on(void) { - unsigned long val, delay; + unsigned long val; irq_desc_t *desc; unsigned int i; @@ -38,15 +39,18 @@ for (i = NR_IRQS-1; i > 0; i--) { desc = irq_desc + i; - spin_lock_irq(&desc->lock); + raw_local_irq_disable(); + spin_lock(&desc->lock); if (!irq_desc[i].action) irq_desc[i].handler->startup(i); - spin_unlock_irq(&desc->lock); + spin_unlock(&desc->lock); + raw_local_irq_enable(); } - /* Wait for longstanding interrupts to trigger. 
*/ - for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) - /* about 20ms delay */ barrier(); + /* + * Wait for longstanding interrupts to trigger, 20 msec delay: + */ + msleep(20); /* * enable any unassigned irqs @@ -56,20 +60,21 @@ for (i = NR_IRQS-1; i > 0; i--) { desc = irq_desc + i; - spin_lock_irq(&desc->lock); + raw_local_irq_disable(); + spin_lock(&desc->lock); if (!desc->action) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->handler->startup(i)) desc->status |= IRQ_PENDING; } - spin_unlock_irq(&desc->lock); + spin_unlock(&desc->lock); + raw_local_irq_enable(); } /* - * Wait for spurious interrupts to trigger + * Wait for spurious interrupts to trigger, 100 msec delay: */ - for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) - /* about 100ms delay */ barrier(); + msleep(100); /* * Now filter out any obviously spurious interrupts @@ -79,7 +84,8 @@ irq_desc_t *desc = irq_desc + i; unsigned int status; - spin_lock_irq(&desc->lock); + raw_local_irq_disable(); + spin_lock(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -91,7 +97,8 @@ if (i < 32) val |= 1 << i; } - spin_unlock_irq(&desc->lock); + spin_unlock(&desc->lock); + raw_local_irq_enable(); } return val; @@ -121,7 +128,8 @@ irq_desc_t *desc = irq_desc + i; unsigned int status; - spin_lock_irq(&desc->lock); + raw_local_irq_disable(); + spin_lock(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -131,7 +139,8 @@ desc->status = status & ~IRQ_AUTODETECT; desc->handler->shutdown(i); } - spin_unlock_irq(&desc->lock); + spin_unlock(&desc->lock); + raw_local_irq_enable(); } up(&probe_sem); @@ -164,7 +173,8 @@ irq_desc_t *desc = irq_desc + i; unsigned int status; - spin_lock_irq(&desc->lock); + raw_local_irq_disable(); + spin_lock(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -176,7 +186,8 @@ desc->status = status & ~IRQ_AUTODETECT; desc->handler->shutdown(i); } - spin_unlock_irq(&desc->lock); + spin_unlock(&desc->lock); 
+ raw_local_irq_enable(); } up(&probe_sem); Index: 2.6-8xx/kernel/irq/internals.h =================================================================== --- 2.6-8xx.orig/kernel/irq/internals.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/irq/internals.h 2005-06-16 13:45:08.000000000 -0300 @@ -4,6 +4,8 @@ extern int noirqdebug; +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); Index: 2.6-8xx/kernel/signal.c =================================================================== --- 2.6-8xx.orig/kernel/signal.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/signal.c 2005-06-16 13:45:08.000000000 -0300 @@ -872,9 +872,12 @@ { int ret = 0; - if (!irqs_disabled()) - BUG(); +#ifndef CONFIG_PREEMPT_RT + BUG_ON(!irqs_disabled()); +#endif +#ifdef CONFIG_SMP assert_spin_locked(&t->sighand->siglock); +#endif if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) /* @@ -1615,6 +1618,7 @@ do_notify_parent_cldstop(current, current->parent, CLD_TRAPPED); read_unlock(&tasklist_lock); + current->flags &= ~PF_NOSCHED; schedule(); } else { /* @@ -1681,6 +1685,7 @@ read_unlock(&tasklist_lock); } + current->flags &= ~PF_NOSCHED; schedule(); /* * Now we don't run again until continued. @@ -1839,6 +1844,9 @@ sigset_t *mask = &current->blocked; int signr = 0; +#ifdef CONFIG_PREEMPT_RT + might_sleep(); +#endif relock: spin_lock_irq(&current->sighand->siglock); for (;;) { Index: 2.6-8xx/kernel/workqueue.c =================================================================== --- 2.6-8xx.orig/kernel/workqueue.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/workqueue.c 2005-06-16 13:45:08.000000000 -0300 @@ -25,6 +25,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use cpu 0's). 
@@ -93,10 +94,12 @@ * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. + * + * Especially no such guarantee on PREEMPT_RT. */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0, cpu = get_cpu(); + int ret = 0, cpu = _smp_processor_id(); if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) @@ -105,7 +108,6 @@ __queue_work(wq->cpu_wq + cpu, work); ret = 1; } - put_cpu(); return ret; } @@ -365,6 +367,39 @@ kthread_stop(p); } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct task_struct *p = wq->cpu_wq[cpu].thread; + struct sched_param param = { .sched_priority = rt_priority }; + int ret; + + set_user_nice(p, nice); + ret = sys_sched_setscheduler(p->pid, policy, &param); + if (ret) + printk("BUG: wq(%s) setscheduler() returned: %d.\n", + wq->name, ret); + +} + +void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing. 
*/ + lock_cpu_hotplug(); + if (is_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + unlock_cpu_hotplug(); +} + + void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -542,6 +577,7 @@ hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } EXPORT_SYMBOL_GPL(__create_workqueue); Index: 2.6-8xx/kernel/module.c =================================================================== --- 2.6-8xx.orig/kernel/module.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/module.c 2005-06-16 13:45:08.000000000 -0300 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +97,16 @@ */ void __module_put_and_exit(struct module *mod, long code) { + /* + * Release the kernel lock if held: + */ + if (current->lock_depth >= 0) { + printk("BUG: module %s holds the BKL [%d] at exit time!\n", + mod->name, current->lock_depth); + dump_stack(); + while (current->lock_depth >= 0) + unlock_kernel(); + } module_put(mod); do_exit(code); } Index: 2.6-8xx/kernel/profile.c =================================================================== --- 2.6-8xx.orig/kernel/profile.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/profile.c 2005-06-16 13:45:08.000000000 -0300 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static unsigned long prof_len, prof_shift; static int prof_on; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -49,9 +51,16 @@ static int __init profile_setup(char * str) { + static char __initdata preemptstr[] = "preempt"; static char __initdata schedstr[] = "schedule"; int par; + if 
(!strncmp(str, preemptstr, strlen(preemptstr))) { + prof_on = PREEMPT_PROFILING; + printk(KERN_INFO "kernel preemption profiling enabled\n"); + if (str[strlen(preemptstr)] == ',') + str += strlen(preemptstr) + 1; + } if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; if (str[strlen(schedstr)] == ',') @@ -277,7 +286,7 @@ up(&profile_flip_mutex); } -void profile_hit(int type, void *__pc) +void notrace profile_hit(int type, void *__pc) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; @@ -383,12 +392,36 @@ } #endif /* !CONFIG_SMP */ -void profile_tick(int type, struct pt_regs *regs) +#ifdef CONFIG_PREEMPT +static void preemption_enabled(void) +{ +} +#endif + +static void preemption_disabled(void) +{ +} + +void notrace profile_tick(int type, struct pt_regs *regs) { if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) - profile_hit(type, (void *)profile_pc(regs)); + if (!user_mode(regs) && (prof_pid == -1 || prof_pid == current->pid) && + cpu_isset(smp_processor_id(), prof_cpu_mask)) { + if (prof_on == PREEMPT_PROFILING && type == CPU_PROFILING) { +#ifdef CONFIG_PREEMPT + int count = preempt_count() - HARDIRQ_OFFSET; + + if (!count) + profile_hit(PREEMPT_PROFILING, + (void *)preemption_enabled); + else +#endif + profile_hit(PREEMPT_PROFILING, + (void *)preemption_disabled); + } else + profile_hit(type, (void *)profile_pc(regs)); + } } #ifdef CONFIG_PROC_FS Index: 2.6-8xx/kernel/Makefile =================================================================== --- 2.6-8xx.orig/kernel/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -9,6 +9,11 @@ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o +obj-$(CONFIG_PREEMPT_RT) += rt.o + +obj-$(CONFIG_DEBUG_PREEMPT) += latency.o +obj-$(CONFIG_LATENCY_TIMING) += latency.o + 
obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o @@ -26,6 +31,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o Index: 2.6-8xx/kernel/spinlock.c =================================================================== --- 2.6-8xx.orig/kernel/spinlock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/spinlock.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,151 +17,149 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic_raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } EXPORT_SYMBOL(generic_raw_read_trylock); -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { preempt_disable(); - if (_raw_spin_trylock(lock)) + if (__raw_spin_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(_raw_spin_trylock); -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) { preempt_disable(); - if (_raw_read_trylock(lock)) + if (__raw_read_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(_raw_read_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) { preempt_disable(); - if (_raw_write_trylock(lock)) + if (__raw_write_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(_raw_write_trylock); #ifndef CONFIG_PREEMPT -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(raw_rwlock_t *lock) { preempt_disable(); - _raw_read_lock(lock); + 
__raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(_raw_read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + __raw_spin_lock_flags(lock, flags); return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(_raw_spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { - local_irq_disable(); + raw_local_irq_disable(); preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(_raw_spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_bh); +EXPORT_SYMBOL(_raw_spin_lock_bh); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); return flags; } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(_raw_read_lock_irqsave); -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock) { - local_irq_disable(); + raw_local_irq_disable(); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(_raw_read_lock_irq); -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(_raw_read_lock_bh); -unsigned long __lockfunc 
_write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); return flags; } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(_raw_write_lock_irqsave); -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock) { - local_irq_disable(); + raw_local_irq_disable(); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(_raw_write_lock_irq); -void __lockfunc _write_lock_bh(rwlock_t *lock) +void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(_raw_write_lock_bh); -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } +EXPORT_SYMBOL(_raw_spin_lock); -EXPORT_SYMBOL(_spin_lock); - -void __lockfunc _write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(raw_rwlock_t *lock) { preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } - -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(_raw_write_lock); #else /* CONFIG_PREEMPT: */ @@ -174,11 +172,11 @@ */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc _raw_##op##_lock(locktype##_t *lock) \ { \ preempt_disable(); \ for (;;) { \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(__raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ if (!(lock)->break_lock) \ @@ -190,18 +188,18 @@ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ +EXPORT_SYMBOL(_raw_##op##_lock); \ \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc 
_raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ preempt_disable(); \ for (;;) { \ - local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ + raw_local_irq_save(flags); \ + if (likely(__raw_##op##_trylock(lock))) \ break; \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ \ preempt_enable(); \ if (!(lock)->break_lock) \ @@ -214,16 +212,16 @@ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ +EXPORT_SYMBOL(_raw_##op##_lock_irqsave); \ \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc _raw_##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + _raw_##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ +EXPORT_SYMBOL(_raw_##op##_lock_irq); \ \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc _raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -232,12 +230,12 @@ /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = _raw_##op##_lock_irqsave(lock); \ local_bh_disable(); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_bh) +EXPORT_SYMBOL(_raw_##op##_lock_bh) /* * Build preemption-friendly versions of the following @@ -248,119 +246,156 @@ * _[spin|read|write]_lock_irqsave() * _[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, raw_rwlock); +BUILD_LOCK_OPS(write, raw_rwlock); #endif /* CONFIG_PREEMPT */ -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); + __raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_raw_spin_unlock); -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) 
{ - _raw_write_unlock(lock); + __raw_write_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_raw_write_unlock); -void __lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) { - _raw_read_unlock(lock); + __raw_read_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(_raw_read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - _raw_spin_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __raw_spin_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_restore(flags); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); - local_irq_enable(); - preempt_enable(); + __raw_spin_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_raw_spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); + __raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(_raw_spin_unlock_bh); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { - _raw_read_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __raw_read_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_restore(flags); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); -void __lockfunc 
_read_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock) { - _raw_read_unlock(lock); - local_irq_enable(); - preempt_enable(); + __raw_read_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(_raw_read_unlock_irq); -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock) { - _raw_read_unlock(lock); - preempt_enable_no_resched(); + __raw_read_unlock(lock); + __preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(_raw_read_unlock_bh); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { - _raw_write_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __raw_write_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_restore(flags); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock) { - _raw_write_unlock(lock); - local_irq_enable(); - preempt_enable(); + __raw_write_unlock(lock); + __preempt_enable_no_resched(); + raw_local_irq_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(_raw_write_unlock_irq); -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock) { - _raw_write_unlock(lock); - preempt_enable_no_resched(); + __raw_write_unlock(lock); + __preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_raw_write_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); - if (_raw_spin_trylock(lock)) + if 
(__raw_spin_trylock(lock)) return 1; - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_raw_spin_trylock_bh); + +int __lockfunc _raw_spin_trylock_irq(raw_spinlock_t *lock) +{ + raw_local_irq_disable(); + preempt_disable(); + if (__raw_spin_trylock(lock)) + return 1; + + __preempt_enable_no_resched(); + raw_local_irq_enable(); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(_raw_spin_trylock_irq); + +int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock, + unsigned long *flags) +{ + raw_local_irq_save(*flags); + preempt_disable(); + if (__raw_spin_trylock(lock)) + return 1; + + __preempt_enable_no_resched(); + raw_local_irq_restore(*flags); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(_raw_spin_trylock_irqsave); -int in_lock_functions(unsigned long addr) +int notrace in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; Index: 2.6-8xx/kernel/timer.c =================================================================== --- 2.6-8xx.orig/kernel/timer.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/timer.c 2005-06-16 13:45:08.000000000 -0300 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -69,6 +70,7 @@ spinlock_t lock; unsigned long timer_jiffies; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; tvec_root_t tv1; tvec_t tv2; tvec_t tv3; @@ -81,13 +83,13 @@ static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) base->running_timer = timer; #endif } /* Fake initialization */ -static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; +static DEFINE_PER_CPU(tvec_base_t, tvec_bases); static void check_timer_failed(struct timer_list *timer) { @@ -160,14 +162,15 @@ { 
tvec_base_t *old_base, *new_base; unsigned long flags; - int ret = 0; + int ret = 0, cpu; BUG_ON(!timer->function); check_timer(timer); spin_lock_irqsave(&timer->lock, flags); - new_base = &__get_cpu_var(tvec_bases); + cpu = _smp_processor_id(); + new_base = &per_cpu(tvec_bases, cpu); repeat: old_base = timer->base; @@ -320,7 +323,7 @@ EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) /*** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -355,10 +358,8 @@ for_each_online_cpu(i) { base = &per_cpu(tvec_bases, i); if (base->running_timer == timer) { - while (base->running_timer == timer) { - cpu_relax(); - preempt_check_resched(); - } + wait_event(base->wait_for_running_timer, + base->running_timer != timer); break; } } @@ -436,13 +437,30 @@ static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; + unsigned long jiffies_sample = jiffies; spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_after_eq(jiffies_sample, base->timer_jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + + if (softirq_need_resched()) { + /* running_timer might be stale: */ + set_running_timer(base, NULL); +// if (waitqueue_active(&base->wait_running_timer)) + wake_up(&base->wait_for_running_timer); + spin_unlock_irq(&base->lock); + cond_resched_all(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. 
Any new jiffy will be taken care of in + * subsequent loops: + */ + } /* * Cascade timers: */ @@ -471,16 +489,20 @@ u32 preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + cond_resched_all(); spin_lock_irq(&base->lock); goto repeat; } } set_running_timer(base, NULL); spin_unlock_irq(&base->lock); +// if (waitqueue_active(&base->wait_running_timer)) + wake_up(&base->wait_for_running_timer); } #ifdef CONFIG_NO_IDLE_HZ @@ -814,8 +836,8 @@ */ void update_process_times(int user_tick) { - struct task_struct *p = current; int cpu = smp_processor_id(); + struct task_struct *p = current; /* Note: this timer irq context must be accounted for as well. */ if (user_tick) @@ -826,7 +848,9 @@ if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); +#ifndef CONFIG_PREEMPT_RT run_posix_cpu_timers(p); +#endif } /* @@ -876,23 +900,12 @@ * playing with xtime and avenrun. */ #ifndef ARCH_HAVE_XTIME_LOCK -seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +DECLARE_RAW_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); #endif /* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - tvec_base_t *base = &__get_cpu_var(tvec_bases); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* * Called by the local, per-CPU timer interrupt on SMP. */ void run_local_timers(void) @@ -901,22 +914,48 @@ } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! 
+ * Time of day handling: */ static inline void update_times(void) { - unsigned long ticks; + unsigned long ticks = 0; + /* + * First test outside the lock for performance reasons: + */ + if (jiffies != wall_jiffies) { + unsigned long flags; - ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); + write_seqlock_irqsave(&xtime_lock, flags); + while (jiffies != wall_jiffies) { + wall_jiffies++; + ticks++; + update_wall_time(1); + /* + * Unlock unconditionally, to make sure + * we dont keep irqs off for a long time! + */ + write_sequnlock_irqrestore(&xtime_lock, flags); + cond_resched_softirq(); + write_seqlock_irqsave(&xtime_lock, flags); + } + calc_load(ticks); + write_sequnlock_irqrestore(&xtime_lock, flags); } - calc_load(ticks); } /* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + tvec_base_t *base = &__get_cpu_var(tvec_bases); + + update_times(); + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -925,7 +964,7 @@ void do_timer(struct pt_regs *regs) { jiffies_64++; - update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1289,6 +1328,8 @@ base = &per_cpu(tvec_bases, cpu); spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); + for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); Index: 2.6-8xx/kernel/sys.c =================================================================== --- 2.6-8xx.orig/kernel/sys.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/sys.c 2005-06-16 13:45:08.000000000 -0300 @@ -165,7 +165,7 @@ * of the last notifier function called. 
*/ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int notrace notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; Index: 2.6-8xx/kernel/power/swsusp.c =================================================================== --- 2.6-8xx.orig/kernel/power/swsusp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/power/swsusp.c 2005-06-16 13:45:08.000000000 -0300 @@ -924,6 +924,7 @@ BUG_ON(!error); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); return error; Index: 2.6-8xx/kernel/latency.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/kernel/latency.c 2005-06-16 14:37:30.000000000 -0300 @@ -0,0 +1,1906 @@ +/* + * kernel/latency.c + * + * Copyright (C) 2004 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PPC +#include +#endif + +#ifdef __i386__ +static inline cycles_t cycles(void) +{ + unsigned long long ret; + + rdtscll(ret); + + return ret; +} +#else +# define cycles() get_cycles() +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = + { trace_lock: RAW_SPIN_LOCK_UNLOCKED }; + +int wakeup_timing = 1; +#endif + +#ifdef CONFIG_LATENCY_TIMING + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +static cycles_t preempt_max_latency = (cycles_t)ULONG_MAX; +static cycles_t preempt_thresh; + +/* + * Should this new latency be reported/recorded? 
+ */ +static int report_latency(cycles_t delta) +{ + if (preempt_thresh) { + if (delta < preempt_thresh) + return 0; + } else { + if (delta <= preempt_max_latency) + return 0; + } + return 1; +} + +/* + * Track maximum latencies and save the trace: + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp int max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +/* + * Per-entry state bits. They are OR-ed together into trace_entry.flags + * when an entry is recorded and tested one by one with '&' when it is + * printed, so every value must be a distinct single bit: + */ +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_IRQS_HARD_OFF = 0x10, +}; + + +#ifdef CONFIG_LATENCY_TRACE + +#define MAX_TRACE (unsigned long)(4096-1) + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycles_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned int nr; + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned int ret; + } sysret; + struct { + int __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycles_t preempt_timestamp; + unsigned long critical_start, 
critical_end; + int critical_sequence; + int early_warning; + +#ifdef CONFIG_LATENCY_TRACE + struct trace_entry trace[MAX_TRACE]; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif + +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp; + +static unsigned long notrace cycles_to_usecs(cycles_t delta) +{ +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} + +static cycles_t notrace usecs_to_cycles(unsigned long delta) +{ + return (cycles_t) delta * (cycles_t) (cpu_khz/1000+1); +} + +#ifdef CONFIG_LATENCY_TRACE + +int trace_enabled = 1; +int mcount_enabled = 1; +int trace_freerunning = 0; +int trace_print_at_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; + +struct saved_trace_struct { + int cpu; + cycles_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + + +static void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3) +{ + struct trace_entry *entry; + +#ifdef CONFIG_DEBUG_PREEMPT +// WARN_ON(!atomic_read(&tr->disabled)); +#endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? 
*/ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("BUG: mcount: stack overflow: %ld [%08lx...%08lx...%08lx]\n", + esp - sizeof(struct thread_info), (long)&esp, (long)current_thread_info(), (long)current_thread_info() + THREAD_SIZE); + dump_stack(); + } + } +#endif + + if (likely(tr->critical_start) || unlikely(trace_user_triggered || trace_all_cpus)) + if (tr->trace_idx < MAX_TRACE) { + u32 pc = preempt_count(); + + entry = tr->trace + tr->trace_idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_disabled() ? TRACE_FLAG_IRQS_OFF : 0) | + + (raw_irqs_disabled() ? TRACE_FLAG_IRQS_HARD_OFF : 0)| + + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = cycles(); + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } + } + tr->trace_idx++; + if (unlikely(trace_freerunning && (tr->trace_idx >= MAX_TRACE))) + tr->trace_idx = 0; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr; + + if (unlikely(trace_enabled <= 
0)) + return; + + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus) { + if (!sch.tr || cpu != sch.cpu) + return; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + if (likely(!atomic_read(&tr->disabled))) { + atomic_inc(&tr->disabled); + ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3); + atomic_dec(&tr->disabled); + } +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +extern void mcount(void); + +EXPORT_SYMBOL(mcount); + +void notrace __mcount(void) +{ + ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0); +} + +void notrace +sys_call(int nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +void notrace sys_ret(int ret) +{ + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 10000L) && ((long)eip > -10000L)) { + seq_printf(m, "(%ld)", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_puts(m, sym_name); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +{ + char 
namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); +} + +static unsigned int out_sequence = -1; +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ + ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0); +} + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0); +} + +static void construct_pid_to_cmdline(void) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned int i, j, entries, pid; + + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + entries = min(tr->trace_idx, MAX_TRACE-1); + + for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} + +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces; + char *cmdline = "<...>"; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (!pid) + return ""; + + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} + +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the trace entry (position) of the smallest-timestamp + * one (that is still in the valid idx range): + */ +static int min_idx(struct block_idx *bidx) +{ + cycles_t min_stamp = 
(cycles_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + /* + * Only indices below the saved entry count hold recorded + * entries. trace_idx keeps counting after the buffer is + * full, so clamp it to the array size too (the same bound + * l_start()/l_next() use): + */ + if ((unsigned long)idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE)) + continue; + entry = max_tr.traces[cpu].trace + idx; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + int cpu, sum, entries; + struct cpu_trace *tmp_max, *tmp_out; + struct trace_entry *out_entry, *entry; + struct block_idx bidx = { { 0, } }; + cycles_t stamp, first_stamp = 0, last_stamp = (cycles_t)-1; + + /* + * Nasty trick. We might overflow the first array but + * there are NR_CPUS of them so we use it as a 'big' + * trace buffer. + */ + tmp_out = out_tr.traces + 0; + *tmp_out = max_tr.traces[max_tr.cpu]; + out_tr.cpu = max_tr.cpu; + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { + entries = min(tmp_out->trace_idx, MAX_TRACE-1); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.) + */ + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; + if (stamp > first_stamp) + first_stamp = stamp; + } + /* + * Save the timestamp range: + */ + + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE-1); + /* + * No saved trace yet? 
+ */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; + + WARN_ON(last_stamp < first_stamp); + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + + /* + * Fetch trace entries one by one, in increasing timestamp + * order. Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; + if (entry->timestamp > last_stamp) { + break; + } + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + } + + WARN_ON(sum > MAX_TRACE*NR_CPUS); + tmp_out->trace_idx = sum; +} + +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); + if (!out_tr.traces[0].trace_idx) { + up(&out_mutex); + return NULL; + } + construct_pid_to_cmdline(); + } + tr = out_tr.traces; + entries = min(tr->trace_idx, MAX_TRACE); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.4 on %s\n", UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d #P:%d)\n", + cycles_to_usecs(tr->saved_latency), + entries, tr->trace_idx, out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + voluntary_preemption, kernel_preemption, + softirq_preemption, hardirq_preemption, + num_online_cpus()); + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld 
policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_offset(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_offset(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); + + } + if (n >= entries) + return NULL; + + return tr->trace + n; +} + +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} + +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} + +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static void +print_timestamp_short(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + seq_putc(m, '+'); + else + seq_putc(m, ' '); +} + +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + 
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.'); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + + +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lu] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_offset(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_offset(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} + +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + 
print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " (%lx %lx %lx)\n", + entry->u.special.v1, entry->u.special.v2, entry->u.special.v3); + + return 0; +} + +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s-%d> (%lx %lx)\n", + pid_to_cmdline(pid), pid, + entry->u.special.v2, entry->u.special.v3); + + return 0; +} + + +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + if (!trace_verbose) + return 0; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +#ifdef CONFIG_X86_64 +# define NR_syscalls (__NR_syscall_max + 1) +#endif + +extern unsigned long sys_call_table[NR_syscalls]; + +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int nr; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = 
cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; + if (nr < NR_syscalls) + print_name(m, sys_call_table[entry->u.syscall.nr]); + else + seq_puts(m, ""); + + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); + + return 0; +} + +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%d)\n", entry->u.sysret.ret); + + return 0; +} + + +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + cond_resched(); + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; + + if (trace_verbose) + seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx); + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSRET: + 
l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} + +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr) +{ + /* free-running needs reordering */ + if (trace_freerunning) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE-1; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx + 1, MAX_TRACE) * + sizeof(struct trace_entry)); + } +} + +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int this_cpu = smp_processor_id(), cpu, all_cpus = 0; + +#ifdef CONFIG_PREEMPT + WARN_ON(!preempt_count() && !raw_irqs_disabled()); +#endif + + max_tr.cpu = this_cpu; + save = max_tr.traces + this_cpu; + + if ((wakeup_timing || trace_user_triggered) && trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } + + save->saved_latency = preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu); + atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr); +} + +#else /* !LATENCY_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, 
unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ +} + +#endif + +/* + * Boot parameter "preempt_thresh=<usecs>": a positive value is converted + * to cycles and stored in preempt_thresh; anything else is ignored. + */ +static int setup_preempt_thresh(char *s) +{ + /* + * get_option() may return without storing a value (e.g. for an + * empty option string), so start from 0 - "no threshold": + */ + int thresh = 0; + + get_option(&s, &thresh); + if (thresh > 0) { + preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %d us\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +#ifdef CONFIG_CRITICAL_TIMING + +static void notrace +check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycles_t T1, T0, delta; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = cycles(); + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T1 = cycles(); + delta = T1-T0; + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + latency = cycles_to_usecs(delta); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + _smp_processor_id(), + latency, cycles_to_usecs(preempt_thresh), t0); + 
else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + _smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(cycles()); + printk(" => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + + up(&max_mutex); +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->early_warning = 0; + tr->trace_idx = 0; + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0); +} + +void notrace touch_critical_timing(void) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(cpu, tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); + +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + _smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); + +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->critical_start = eip; + tr->trace_idx = 0; + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0); + + atomic_dec(&tr->disabled); +} + +static inline void notrace 
+__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0); + check_critical_timing(cpu, tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING +# define irqs_off_preempt_count() preempt_count() +#else +# define irqs_off_preempt_count() 0 +#endif + +void notrace trace_irqs_off_lowlevel(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, 0); +} + +void notrace trace_irqs_off(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +EXPORT_SYMBOL(trace_irqs_off); + +void notrace trace_irqs_on(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +EXPORT_SYMBOL(trace_irqs_on); + +#endif + +#endif /* LATENCY_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + +static inline unsigned long get_parent_eip(void) +{ + unsigned long parent_eip = CALLER_ADDR1; + + if (in_lock_functions(parent_eip)) { + parent_eip = CALLER_ADDR2; + if (in_lock_functions(parent_eip)) + parent_eip = CALLER_ADDR3; + } + + return parent_eip; +} + +void notrace add_preempt_count(unsigned int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? 
+ */ + BUG_ON(((int)preempt_count() < 0)); + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +#endif + + preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __start_critical_timing(eip, parent_eip); + } +#endif + (void)eip, (void)parent_eip; +} +EXPORT_SYMBOL(add_preempt_count); + +void notrace sub_preempt_count(unsigned int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + BUG_ON(unlikely(val > preempt_count())); + + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +#endif + preempt_count() -= val; +} + +EXPORT_SYMBOL(sub_preempt_count); + +void notrace mask_preempt_count(unsigned int mask) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + + preempt_count() |= mask; + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __start_critical_timing(eip, parent_eip); + } +#endif + (void) eip, (void) parent_eip; +} +EXPORT_SYMBOL(mask_preempt_count); + +void notrace unmask_preempt_count(unsigned int mask) +{ +#ifdef 
CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +#endif + preempt_count() &= ~mask; +} +EXPORT_SYMBOL(unmask_preempt_count); + + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supercedes it. (in that + * case we throw away the currently traced task - we dont try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING + +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency; + unsigned long t0, t1; + cycles_t T0, T1, delta; + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) + goto out; + + T0 = tr->preempt_timestamp; + T1 = cycles(); + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0); + T1 = cycles(); + delta = T1-T0; + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + latency = cycles_to_usecs(delta); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + _smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "wakeup.\n", current->comm, current->pid, + _smp_processor_id(), latency); + + max_sequence++; + + up(&max_mutex); +out: + atomic_dec(&tr->disabled); +} + +/* + * Start wakeup latency tracing - called with the 
runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) + return; + + spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio >= p->prio)) + goto out_unlock; + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + sch.cpu = task_cpu(p); + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. + */ + sch.tr = tr = cpu_traces + sch.cpu; + if (trace_all_cpus) + for_each_online_cpu(cpu) + cpu_traces[cpu].trace_idx = 0; + else + tr->trace_idx = 0; + +// if (!atomic_read(&tr->disabled)) { +// atomic_inc(&tr->disabled); + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(_smp_processor_id(), tr); +// atomic_dec(&tr->disabled); +// } + + mcount(); +out_unlock: + spin_unlock(&sch.trace_lock); +} + +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + if (trace_user_triggered || !wakeup_timing) + return; + + trace_special_pid(p->pid, p->prio, 0); + + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + if (p == sch.task) { + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); + /* + * Somewhat racy but safer - the printks within + * check_wakeup_timing() can call back into the + * wakup-timing code and deadlock: + */ +// atomic_inc(&tr->disabled); + spin_unlock(&sch.trace_lock); + check_wakeup_timing(tr, CALLER_ADDR0); +// atomic_dec(&tr->disabled); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + spin_unlock(&sch.trace_lock); + } + raw_local_irq_restore(flags); +} + +void trace_change_sched_cpu(struct 
task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special(task_cpu(p), task_cpu(p), new_cpu); + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); +} + +#endif + +#ifdef CONFIG_LATENCY_TRACE + +long user_trace_start(void) +{ + struct cpu_trace *tr; + int cpu; + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + if (down_trylock(&max_mutex)) + return -EAGAIN; + + preempt_disable(); + tr = cpu_traces + smp_processor_id(); + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + unsigned long flags; + + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + sch.task = current; + sch.cpu = smp_processor_id(); + sch.tr = tr; + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); + } +#endif + if (trace_all_cpus) + for_each_online_cpu(cpu) + cpu_traces[cpu].trace_idx = 0; + else + tr->trace_idx = 0; + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + _trace_cmdline(_smp_processor_id(), tr); + mcount(); + preempt_enable(); + + up(&max_mutex); + + return 0; +} + +long user_trace_stop(void) +{ + unsigned long latency; + struct cpu_trace *tr; + cycles_t delta; + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + preempt_disable(); + mcount(); + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + unsigned long flags; + + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + if (current != sch.task) { + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); + preempt_enable(); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); + } else +#endif + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { + delta = cycles() 
- tr->preempt_timestamp; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us user-latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + _smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us user-latency.\n", + current->comm, current->pid, + _smp_processor_id(), latency); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + preempt_enable(); + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); + +void stop_trace(void) +{ + if (trace_print_at_crash) + trace_enabled = -1; +} + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + int hardirq, softirq; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + printk("%-5d ", entry->pid); + + printk("%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'n' : '.'); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + printk("H"); + else { + if (hardirq) + printk("h"); + else { + if (softirq) + printk("s"); + else + printk("."); + } + } + + printk(":%d %ld.%03ldms: ", + entry->preempt_count, abs_usecs/1000, abs_usecs % 1000); + + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); +} + +void print_last_trace(void) +{ + unsigned int idx0, idx, i; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry, *next_entry; + + if (trace_enabled != -1) + return; + + preempt_disable(); + tr = cpu_traces + smp_processor_id(); + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) + idx0 = MAX_TRACE-1; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + entry = tr->trace + idx; + idx++; + if (idx == MAX_TRACE) + idx = 0; + next_entry = tr->trace + idx; + if (entry->type == TRACE_FN) + print_entry(entry, entry0, next_entry); + } + trace_print_at_crash = 0; + + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) 
+ */ +unsigned long nmi_eips[NR_CPUS]; +unsigned long nmi_flags[NR_CPUS]; + +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE + +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task->thread_info->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. 
( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE) +void print_traces(struct task_struct *task) +{ + preempt_disable(); +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_LATENCY_TRACE + print_last_trace(); +#endif + preempt_enable(); +} +#endif + +#ifdef CONFIG_LATENCY_TIMING + +static int preempt_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cycles_t *max = data; + + return sprintf(page, "%ld\n", cycles_to_usecs(*max)); +} + +static int preempt_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int c, done = 0, val, sum = 0; + cycles_t *max = data; + + while (count) { + if (get_user(c, buffer)) + return -EFAULT; + val = c - '0'; + buffer++; + done++; + count--; + if (c == 0 || c == '\n') + break; + if (val > 9) + return -EINVAL; + sum *= 10; + sum += val; + } + *max = usecs_to_cycles(sum); + return done; +} + +static __init int latency_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("sys/kernel/preempt_max_latency", 0644, NULL); + + entry->nlink = 1; + entry->data = &preempt_max_latency; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + + entry = create_proc_entry("sys/kernel/preempt_thresh", 0644, NULL); + + entry->nlink = 1; + entry->data = &preempt_thresh; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + + return 0; +} +__initcall(latency_init); + +#endif + Index: 2.6-8xx/kernel/itimer.c =================================================================== --- 2.6-8xx.orig/kernel/itimer.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/itimer.c 2005-06-16 13:45:08.000000000 -0300 @@ -142,7 +142,8 @@ * here because do_setitimer makes sure we have finished running * before it touches 
anything. */ - it_real_arm(p, p->signal->it_real_incr); + if (p->signal) + it_real_arm(p, p->signal->it_real_incr); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) Index: 2.6-8xx/kernel/softirq.c =================================================================== --- 2.6-8xx.orig/kernel/softirq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/softirq.c 2005-06-16 13:45:08.000000000 -0300 @@ -4,6 +4,9 @@ * Copyright (C) 1992 Linus Torvalds * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Softirq-split implemetation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include @@ -16,6 +19,8 @@ #include #include #include +#include +#include #include /* @@ -43,7 +48,13 @@ static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; +}; + +static DEFINE_PER_CPU(struct softirqdata, ksoftirqd[MAX_SOFTIRQ]); /* * we cannot loop indefinitely here to avoid userspace starvation, @@ -51,16 +62,31 @@ * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd[softirq].tsk); if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); } /* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } +} +/* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. 
* @@ -71,7 +97,7 @@ */ #define MAX_SOFTIRQ_RESTART 10 -asmlinkage void __do_softirq(void) +asmlinkage void ___do_softirq(void) { struct softirq_action *h; __u32 pending; @@ -80,37 +106,86 @@ pending = local_softirq_pending(); - local_bh_disable(); cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - local_irq_enable(); + raw_local_irq_enable(); h = softirq_vec; do { if (pending & 1) { - h->action(h); + { + u32 preempt_count = preempt_count(); + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("softirq preempt bug: exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } + } rcu_bh_qsctr_inc(cpu); + cond_resched_all(); } h++; pending >>= 1; } while (pending); - local_irq_disable(); + raw_local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) goto restart; if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ + unsigned long p_flags; +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + local_bh_disable(); + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + + ___do_softirq(); __local_bh_enable(); + + current->flags |= p_flags; +} + +/* + * 'delayed' softirq execution. Does not disable bhs and thus + * makes most of the softirq handlers preemptable - as long as + * they are not executed 'directly'. 
+ */ +asmlinkage void _do_softirq(void) +{ + raw_local_irq_disable(); + if (!softirq_preemption) + __do_softirq(); + else + ___do_softirq(); + raw_local_irq_enable(); } + #ifndef __ARCH_HAS_DO_SOFTIRQ asmlinkage void do_softirq(void) @@ -121,20 +196,22 @@ if (in_interrupt()) return; - local_irq_save(flags); + raw_local_irq_save(flags); pending = local_softirq_pending(); if (pending) __do_softirq(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); #endif +#ifndef CONFIG_PREEMPT_RT + void local_bh_enable(void) { WARN_ON(irqs_disabled()); @@ -152,6 +229,8 @@ } EXPORT_SYMBOL(local_bh_enable); +#endif + #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -167,7 +246,7 @@ sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -187,7 +266,7 @@ * schedule the softirq soon. */ if (!in_interrupt()) - wakeup_softirqd(); + trigger_softirqs(); } EXPORT_SYMBOL(raise_softirq_irqoff); @@ -196,9 +275,9 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); raise_softirq_irqoff(nr); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) @@ -224,11 +303,11 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); t->next = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = t; raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_schedule); @@ -237,11 +316,11 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); t->next = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = t; raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); @@ -250,10 +329,10 @@ { 
struct tasklet_struct *list; - local_irq_disable(); + raw_local_irq_disable(); list = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = NULL; - local_irq_enable(); + raw_local_irq_enable(); while (list) { struct tasklet_struct *t = list; @@ -271,11 +350,11 @@ tasklet_unlock(t); } - local_irq_disable(); + raw_local_irq_disable(); t->next = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = t; __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + raw_local_irq_enable(); } } @@ -283,10 +362,10 @@ { struct tasklet_struct *list; - local_irq_disable(); + raw_local_irq_disable(); list = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = NULL; - local_irq_enable(); + raw_local_irq_enable(); while (list) { struct tasklet_struct *t = list; @@ -304,11 +383,11 @@ tasklet_unlock(t); } - local_irq_disable(); + raw_local_irq_disable(); t->next = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = t; __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); + raw_local_irq_enable(); } } @@ -347,31 +426,50 @@ open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } -static int ksoftirqd(void * __bind_cpu) +static int ksoftirqd(void * __data) { - set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; + struct sched_param param = { .sched_priority = MAX_RT_PRIO/4-1 }; + struct softirqdata *data = __data; + u32 mask = (1 << data->nr); + struct softirq_action *h; + + printk("ksoftirqd started up.\n"); + + printk("softirq RT prio: %d.\n", param.sched_priority); +// sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); + if (!(local_softirq_pending() & mask)) { + __preempt_enable_no_resched(); schedule(); preempt_disable(); } - __set_current_state(TASK_RUNNING); - while 
(local_softirq_pending()) { + while (local_softirq_pending() & mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(data->cpu)) goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); + + raw_local_irq_disable(); + __preempt_enable_no_resched(); + local_softirq_pending() &= ~mask; + local_bh_disable(); + raw_local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + __local_bh_enable(); cond_resched(); preempt_disable(); } @@ -423,12 +521,12 @@ BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { struct tasklet_struct **i; /* CPU is dead, so no lock needed. */ - local_irq_disable(); + raw_local_irq_disable(); /* Find end, append list for that CPU. */ for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); @@ -441,40 +539,63 @@ per_cpu(tasklet_hi_vec, cpu).list = NULL; raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); + raw_local_irq_enable(); } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [SCSI_SOFTIRQ] = "scsi", + [TASKLET_SOFTIRQ] = "tasklet", +}; + static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: - BUG_ON(per_cpu(tasklet_vec, hotcpu).list); - BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + /* We may have tasklets already scheduled on + processor 0, so don't check there. 
*/ + if (hotcpu != 0) { + BUG_ON(per_cpu(tasklet_vec, hotcpu).list); + BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); + } + for (i = 0; i < MAX_SOFTIRQ; i++) { + per_cpu(ksoftirqd[i].nr, hotcpu) = i; + per_cpu(ksoftirqd[i].cpu, hotcpu) = hotcpu; + p = kthread_create(ksoftirqd, &per_cpu(ksoftirqd[i], hotcpu), + "softirq-%s/%d", softirq_names[i], hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd[i].tsk, hotcpu) = p; } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; break; case CPU_ONLINE: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < MAX_SOFTIRQ; i++) + wake_up_process(per_cpu(ksoftirqd[i].tsk, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); + for (i = 0; i < MAX_SOFTIRQ; i++) + kthread_bind(per_cpu(ksoftirqd[i], hotcpu).tsk, smp_processor_id()); case CPU_DEAD: - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - kthread_stop(p); + for (i = 0; i < MAX_SOFTIRQ; i++) { + p = per_cpu(ksoftirqd[i], hotcpu).tsk; + per_cpu(ksoftirqd[i], hotcpu).tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; #endif /* CONFIG_HOTPLUG_CPU */ @@ -494,3 +615,33 @@ register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); + +#endif + +#endif + Index: 2.6-8xx/kernel/acct.c 
=================================================================== --- 2.6-8xx.orig/kernel/acct.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/acct.c 2005-06-16 13:45:08.000000000 -0300 @@ -88,7 +88,7 @@ struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* * Called whenever the timer says to check the free space. Index: 2.6-8xx/kernel/softlockup.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/kernel/softlockup.c 2005-06-16 13:45:08.000000000 -0300 @@ -0,0 +1,159 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 10 seconds or more. + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_RAW_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timeout) = 0; +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, _smp_processor_id()) = jiffies; +} + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timeout = per_cpu(timeout, this_cpu); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (time_after(jiffies, timeout)) { + 
wake_up_process(per_cpu(watchdog_task, this_cpu)); + per_cpu(timeout, this_cpu) = jiffies + msecs_to_jiffies(1000); + } + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + msecs_to_jiffies(10000))) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); +#if defined(__i386__) && defined(CONFIG_SMP) + nmi_show_all_regs(); +#endif + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. + */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, &param); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef 
CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} Index: 2.6-8xx/kernel/futex.c =================================================================== --- 2.6-8xx.orig/kernel/futex.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/futex.c 2005-06-16 13:45:08.000000000 -0300 @@ -588,8 +588,13 @@ * !list_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 
*/ - if (likely(!list_empty(&q.list))) + if (likely(!list_empty(&q.list))) { + unsigned long nosched_flag = current->flags & PF_NOSCHED; + + current->flags &= ~PF_NOSCHED; time = schedule_timeout(time); + current->flags |= nosched_flag; + } __set_current_state(TASK_RUNNING); /* Index: 2.6-8xx/kernel/sysctl.c =================================================================== --- 2.6-8xx.orig/kernel/sysctl.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/kernel/sysctl.c 2005-06-16 13:45:08.000000000 -0300 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -275,6 +276,142 @@ .proc_handler = &proc_dointvec, }, { + .ctl_name = KERN_PANIC, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = KERN_PANIC, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = KERN_PANIC, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = KERN_PANIC, + .procname = "wakeup_timing", + .data = &wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LATENCY_TRACE + { + .ctl_name = KERN_PANIC, + 
.procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_freerunning", + .data = &trace_freerunning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_print_at_crash", + .data = &trace_print_at_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + { + .ctl_name = KERN_PANIC, + .procname = "preempt_locks", + .data = &preempt_locks_user, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_GENERIC_HARDIRQS + { + .ctl_name = KERN_PANIC, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, Index: 2.6-8xx/scripts/testlpp.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/scripts/testlpp.c 2005-06-16 13:45:08.000000000 -0300 @@ -0,0 +1,129 @@ +/* + * testlpp.c: use the /dev/lpptest device to 
test IRQ handling + * latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner + * + * licensed under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static inline unsigned long long int rdtsc(void) +{ + unsigned long long int x, y; + for (;;) { + __asm__ volatile ("rdtsc" : "=A" (x)); + __asm__ volatile ("rdtsc" : "=A" (y)); + if (y - x < 1000) + return y; + } +} + +static unsigned long long calibrate_loop(void) +{ + unsigned long long mytime1, mytime2; + + mytime1 = rdtsc(); + usleep(500000); + mytime2 = rdtsc(); + + return (mytime2 - mytime1) * 2; +} + +#define time_to_usecs(time) ((double)(time)*1000000.0/(double)cycles_per_sec) + +int fd, total; +unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec; + +void cleanup(int sig) /* SIGINT handler (sig != 0) and normal exit path (sig == 0): print statistics and exit */ +{ + ioctl (fd, LPPTEST_ENABLE, &tim); + if (sig) + printf("[ interrupted - exiting ]\n"); + printf("\ntotal number of responses: %d\n", total); + printf("average reponse latency: %.2lf usecs\n", + total ? time_to_usecs(sum_tim)/total : 0.0); /* no responses: report 0 instead of nan */ + printf("minimum latency: %.2lf usecs\n", + time_to_usecs(total ? min_tim : 0)); /* min_tim is still the -1ULL sentinel when total == 0 */ + printf("maximum latency: %.2lf usecs\n", + time_to_usecs(max_tim)); + exit(0); +} + +#define HZ 300 + +int main (int argc, char **argv) +{ + unsigned int nr_requests = 0; + + if (argc > 2) { + fprintf(stderr, "usage: testlpp []\n"); + exit(-1); + } + if (argc == 2) + nr_requests = atol(argv[1]); + + if (getuid() != 0) { + fprintf(stderr, "need to run as root!\n"); + exit(-1); + } + mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1)); + + fd = open("/dev/lpptest", O_RDWR); + if (fd == -1) { + fprintf(stderr, "could not open /dev/lpptest, your 
kernel doesnt have CONFIG_LPPTEST enabled?\n"); + exit(-1); + } + + signal(SIGINT,&cleanup); + + ioctl (fd, LPPTEST_DISABLE, &tim); + + fprintf(stderr, "calibrating cycles to usecs: "); + cycles_per_sec = calibrate_loop(); + fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000); + if (nr_requests) + fprintf(stderr, "[max # of requests: %u]\n", nr_requests); + fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ); + + while(1) { + ioctl (fd, LPPTEST_TEST, &tim); + if (tim == 0) + printf ("No response from target.\n"); + else { + if (tim > max_tim) { + printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim); + max_tim = tim; + } + if (tim < min_tim) + min_tim = tim; + total++; + sum_tim += tim; /* accumulate before the break so the final sample is averaged too */ + if (total == nr_requests) + break; + } + usleep(1000000/HZ); + } + cleanup(0); + + return 0; +} + + Index: 2.6-8xx/scripts/Makefile =================================================================== --- 2.6-8xx.orig/scripts/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/scripts/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -12,6 +12,7 @@ hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +hostprogs-$(CONFIG_LPPTEST) += testlpp always := $(hostprogs-y) Index: 2.6-8xx/ipc/sem.c =================================================================== --- 2.6-8xx.orig/ipc/sem.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/ipc/sem.c 2005-06-16 13:45:08.000000000 -0300 @@ -359,6 +359,11 @@ if (error <= 0) { struct sem_queue *n; remove_from_queue(sma,q); + /* + * make sure that the wakeup doesnt preempt + * _this_ cpu prematurely. (on preempt_rt) + */ + preempt_disable(); q->status = IN_WAKEUP; /* * Continue scanning. The next operation @@ -380,6 +385,7 @@ * writing q->status. 
*/ q->status = error; + preempt_enable(); q = n; } else { q = q->next; Index: 2.6-8xx/init/main.c =================================================================== --- 2.6-8xx.orig/init/main.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/init/main.c 2005-06-16 13:45:08.000000000 -0300 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -379,10 +380,12 @@ static void noinline rest_init(void) __releases(kernel_lock) { + system_state = SYSTEM_BOOTING_SCHEDULER_OK; + kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); unlock_kernel(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); cpu_idle(); } @@ -425,6 +428,14 @@ { char * command_line; extern struct kernel_param __start___param[], __stop___param[]; +#ifdef CONFIG_PREEMPT_RT + /* + * Force the soft IRQ state to mimic the hard state until + * we finish boot-up. + */ + local_irq_disable(); +#endif + /* * Interrupts are still disabled. Do necessary setups, then * enable them @@ -453,8 +464,16 @@ * fragile until we cpu_idle() for the first time. */ preempt_disable(); +#ifdef CONFIG_PREEMPT_RT + /* + * Reset the irqs off flag after sched_init resets the preempt_count. + */ + local_irq_disable(); +#endif + build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); parse_early_param(); parse_args("Booting kernel", command_line, __start___param, @@ -478,7 +497,12 @@ if (panic_later) panic(panic_later, panic_param); profile_init(); - local_irq_enable(); + + /* + * Soft IRQ state will be enabled with the hard state. 
+ */ + raw_local_irq_enable(); + #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && initrd_start < min_low_pfn << PAGE_SHIFT) { @@ -522,6 +546,9 @@ acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_PREEMPT_RT + WARN_ON(raw_irqs_disabled()); +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -564,6 +591,12 @@ msg = "disabled interrupts"; local_irq_enable(); } +#ifdef CONFIG_PREEMPT_RT + if (raw_irqs_disabled()) { + msg = "disabled hard interrupts"; + raw_local_irq_enable(); + } +#endif if (msg) { printk(KERN_WARNING "error in initcall at 0x%p: " "returned with %s\n", *call, msg); @@ -601,12 +634,15 @@ static void do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + extern int spawn_desched_task(void); #ifdef CONFIG_SMP extern int migration_init(void); migration_init(); #endif spawn_ksoftirqd(); + spawn_desched_task(); + spawn_softlockup_task(); } static void run_init_process(char *init_filename) @@ -653,6 +689,8 @@ /* Sets up cpus_possible() */ smp_prepare_cpus(max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); fixup_cpu_present_map(); @@ -678,6 +716,50 @@ else prepare_namespace(); +#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_LOCKING_MODE) + defined(CONFIG_RT_DEADLOCK_DETECT) + defined(CONFIG_DEBUG_PREEMPT) + defined(CONFIG_CRITICAL_PREEMPT_TIMING) + defined(CONFIG_CRITICAL_IRQSOFF_TIMING) + defined(CONFIG_LATENCY_TRACE) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC)) + +#if DEBUG_COUNT > 0 + printk(KERN_ERR "*****************************************************************************\n"); + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* WARNING, the following debugging option is turned on in your .config: *\n"); +#else + printk(KERN_ERR "* WARNING, the following debugging options are turned on in your .config: *\n"); +#endif + printk(KERN_ERR "* *\n"); +#ifdef CONFIG_DEBUG_RT_LOCKING_MODE + printk(KERN_ERR "* CONFIG_DEBUG_RT_LOCKING_MODE *\n"); +#endif 
+#ifdef CONFIG_RT_DEADLOCK_DETECT + printk(KERN_ERR "* CONFIG_RT_DEADLOCK_DETECT *\n"); +#endif +#ifdef CONFIG_DEBUG_PREEMPT + printk(KERN_ERR "* CONFIG_DEBUG_PREEMPT *\n"); +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + printk(KERN_ERR "* CONFIG_CRITICAL_PREEMPT_TIMING *\n"); +#endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + printk(KERN_ERR "* CONFIG_CRITICAL_IRQSOFF_TIMING *\n"); +#endif +#ifdef CONFIG_LATENCY_TRACE + printk(KERN_ERR "* CONFIG_LATENCY_TRACE *\n"); +#endif +#ifdef CONFIG_DEBUG_SLAB + printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); +#endif + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* it may increase runtime overhead and latencies considerably! *\n"); +#else + printk(KERN_ERR "* they may increase runtime overhead and latencies considerably! *\n"); +#endif + printk(KERN_ERR "* *\n"); + printk(KERN_ERR "*****************************************************************************\n"); +#endif /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the @@ -700,6 +782,9 @@ * The Bourne shell can be used instead of init if we are * trying to recover a really broken machine. 
*/ +#ifdef CONFIG_PREEMPT_RT + WARN_ON(raw_irqs_disabled()); +#endif if (execute_command) run_init_process(execute_command); Index: 2.6-8xx/arch/ppc/8xx_io/fec.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8xx_io/fec.c 2005-06-16 13:44:27.000000000 -0300 +++ 2.6-8xx/arch/ppc/8xx_io/fec.c 2005-06-16 13:45:08.000000000 -0300 @@ -166,7 +166,7 @@ struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: 2.6-8xx/arch/ppc/8xx_io/cs4218_tdm.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8xx_io/cs4218_tdm.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/8xx_io/cs4218_tdm.c 2005-06-16 13:45:08.000000000 -0300 @@ -1380,7 +1380,7 @@ spin_unlock_irqrestore(&cs4218_lock, flags); } -static struct timer_list beep_timer = TIMER_INITIALIZER(cs_nosound, 0, 0); +static DEFINE_TIMER(beep_timer, cs_nosound, 0, 0); }; static void cs_mksound(unsigned int hz, unsigned int ticks) Index: 2.6-8xx/arch/ppc/8xx_io/enet.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8xx_io/enet.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/8xx_io/enet.c 2005-06-16 13:45:08.000000000 -0300 @@ -146,7 +146,7 @@ unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: 2.6-8xx/arch/ppc/8xx_io/commproc.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8xx_io/commproc.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/8xx_io/commproc.c 2005-06-16 13:45:08.000000000 -0300 @@ -373,7 +373,7 @@ /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
Index: 2.6-8xx/arch/ppc/boot/Makefile =================================================================== --- 2.6-8xx.orig/arch/ppc/boot/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/boot/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -11,6 +11,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: 2.6-8xx/arch/ppc/Kconfig =================================================================== --- 2.6-8xx.orig/arch/ppc/Kconfig 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/Kconfig 2005-06-16 13:45:08.000000000 -0300 @@ -17,9 +17,16 @@ config RWSEM_GENERIC_SPINLOCK bool + depends on !PREEMPT_RT + +config ASM_SEMAPHORES + bool + depends on !PREEMPT_RT + default y config RWSEM_XCHGADD_ALGORITHM bool + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT default y config GENERIC_CALIBRATE_DELAY @@ -900,15 +907,7 @@ depends on SMP default "4" -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
+source "lib/Kconfig.RT" config HIGHMEM bool "High memory support" Index: 2.6-8xx/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/ppc4xx_setup.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/ppc4xx_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -174,6 +174,7 @@ freq = fw_get_tbfreq(); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. Index: 2.6-8xx/arch/ppc/syslib/open_pic2.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/open_pic2.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/open_pic2.c 2005-06-16 13:45:08.000000000 -0300 @@ -386,7 +386,7 @@ vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: 2.6-8xx/arch/ppc/syslib/m8xx_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/m8xx_setup.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/m8xx_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -161,6 +161,7 @@ printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. 
This used * to be done elsewhere, but other changes caused it to get Index: 2.6-8xx/arch/ppc/syslib/ocp.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/ocp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/ocp.c 2005-06-16 13:45:08.000000000 -0300 @@ -45,11 +45,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: 2.6-8xx/arch/ppc/syslib/prom.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/prom.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/prom.c 2005-06-16 13:45:08.000000000 -0300 @@ -1397,7 +1397,7 @@ } #endif -static DEFINE_SPINLOCK(rtas_lock); +static DEFINE_RAW_SPINLOCK(rtas_lock); /* this can be called after setup -- Cort */ int __openfirmware Index: 2.6-8xx/arch/ppc/syslib/open_pic.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/open_pic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/open_pic.c 2005-06-16 13:45:08.000000000 -0300 @@ -528,7 +528,7 @@ } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: 2.6-8xx/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/mpc52xx_setup.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/mpc52xx_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -213,6 +213,7 @@ tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } int mpc52xx_match_psc_function(int psc_idx, const char *func) Index: 2.6-8xx/arch/ppc/syslib/m8260_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/m8260_setup.c 
2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/m8260_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -77,6 +77,7 @@ divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that Index: 2.6-8xx/arch/ppc/syslib/todc_time.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/todc_time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/todc_time.c 2005-06-16 13:45:08.000000000 -0300 @@ -508,6 +508,7 @@ tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: 2.6-8xx/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/ppc85xx_setup.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/ppc85xx_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -55,6 +55,7 @@ divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: 2.6-8xx/arch/ppc/syslib/ibm44x_common.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/ibm44x_common.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/ibm44x_common.c 2005-06-16 13:45:08.000000000 -0300 @@ -60,6 +60,7 @@ { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: 2.6-8xx/arch/ppc/syslib/cpm2_common.c =================================================================== --- 2.6-8xx.orig/arch/ppc/syslib/cpm2_common.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/syslib/cpm2_common.c 2005-06-16 13:45:08.000000000 -0300 @@ -116,7 +116,7 @@ 
/* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: 2.6-8xx/arch/ppc/kernel/dma-mapping.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/dma-mapping.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/dma-mapping.c 2005-06-16 13:45:08.000000000 -0300 @@ -71,7 +71,7 @@ * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: 2.6-8xx/arch/ppc/kernel/semaphore.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/semaphore.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/semaphore.c 2005-06-16 13:45:08.000000000 -0300 @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: 2.6-8xx/arch/ppc/kernel/time.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -86,10 +86,11 @@ unsigned tb_to_us; unsigned tb_last_stamp; unsigned long tb_to_ns_scale; +unsigned long cpu_khz; extern unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -293,6 +294,7 @@ tb_ticks_per_jiffy = DECREMENTER_COUNT_601; /* mulhwu_scale_factor(1000000000, 1000000) is 0x418937 */ tb_to_us = 0x418937; + cpu_khz = 1000000000 / 1000; } else { ppc_md.calibrate_decr(); tb_to_ns_scale = mulhwu(tb_to_us, 1000 << 10); Index: 2.6-8xx/arch/ppc/kernel/entry.S =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/entry.S 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/entry.S 2005-06-16 14:37:30.000000000 -0300 @@ -1026,3 +1026,85 @@ /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_OF */ + +#ifdef CONFIG_MCOUNT + +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. 
+ */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function preamble, + * before the stack frame is created. An example of this preamble code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: 2.6-8xx/arch/ppc/kernel/Makefile =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -14,8 +14,9 @@ obj-y := entry.o traps.o irq.o idle.o time.o misc.o \ process.o signal.o ptrace.o align.o \ - semaphore.o 
syscalls.o setup.o \ + syscalls.o setup.o \ cputable.o ppc_htab.o perfmon.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_6xx) += l2cr.o cpu_setup_6xx.o obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o obj-$(CONFIG_POWER4) += cpu_setup_power4.o Index: 2.6-8xx/arch/ppc/kernel/traps.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/traps.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/traps.c 2005-06-16 13:45:08.000000000 -0300 @@ -72,7 +72,7 @@ * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * fp, long err) { Index: 2.6-8xx/arch/ppc/kernel/ppc_ksyms.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/ppc_ksyms.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/ppc_ksyms.c 2005-06-16 14:37:30.000000000 -0300 @@ -295,9 +295,11 @@ EXPORT_SYMBOL(xmon); EXPORT_SYMBOL(xmon_printf); #endif -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_up); +EXPORT_SYMBOL(__compat_down); +EXPORT_SYMBOL(__compat_down_interruptible); +#endif #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) extern void (*debugger)(struct pt_regs *regs); Index: 2.6-8xx/arch/ppc/kernel/smp.c =================================================================== --- 2.6-8xx.orig/arch/ppc/kernel/smp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/kernel/smp.c 2005-06-16 13:45:08.000000000 -0300 @@ -137,6 +137,11 @@ smp_message_pass(cpu, PPC_MSG_RESCHEDULE, 0, 0); } +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -161,7 +166,7 @@ * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: 2.6-8xx/arch/ppc/8260_io/fcc_enet.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8260_io/fcc_enet.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/8260_io/fcc_enet.c 2005-06-16 13:45:08.000000000 -0300 @@ -378,7 +378,7 @@ volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: 2.6-8xx/arch/ppc/8260_io/enet.c =================================================================== --- 2.6-8xx.orig/arch/ppc/8260_io/enet.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/8260_io/enet.c 2005-06-16 13:45:08.000000000 -0300 @@ -117,7 +117,7 @@ scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: 2.6-8xx/arch/ppc/platforms/ev64260.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/ev64260.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/ev64260.c 2005-06-16 13:45:08.000000000 -0300 @@ -552,6 +552,7 @@ tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: 2.6-8xx/arch/ppc/platforms/chrp_time.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/chrp_time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/chrp_time.c 2005-06-16 13:45:08.000000000 -0300 @@ -189,4 +189,5 @@ freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; } Index: 2.6-8xx/arch/ppc/platforms/powerpmc250.c =================================================================== --- 
2.6-8xx.orig/arch/ppc/platforms/powerpmc250.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/powerpmc250.c 2005-06-16 13:45:08.000000000 -0300 @@ -167,6 +167,7 @@ tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void Index: 2.6-8xx/arch/ppc/platforms/pmac_time.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/pmac_time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/pmac_time.c 2005-06-16 13:45:08.000000000 -0300 @@ -197,6 +197,7 @@ tb_ticks_per_jiffy = (dstart - dend) / (6 * (HZ/100)); tb_to_us = mulhwu_scale_factor(dstart - dend, 60000); + cpu_khz = (dstart - dend) / 60; printk(KERN_INFO "via_calibrate_decr: ticks per jiffy = %u (%u ticks)\n", tb_ticks_per_jiffy, dstart - dend); @@ -288,4 +289,5 @@ freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; } Index: 2.6-8xx/arch/ppc/platforms/adir_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/adir_setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/adir_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -77,6 +77,7 @@ freq = adir_get_bus_speed(); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int Index: 2.6-8xx/arch/ppc/platforms/pmac_pic.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/pmac_pic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/pmac_pic.c 2005-06-16 13:45:08.000000000 -0300 @@ -68,7 +68,7 @@ static int max_real_irqs __pmacdata; static u32 level_mask[4] __pmacdata; -static DEFINE_SPINLOCK(pmac_pic_lock __pmacdata); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock __pmacdata); #define 
GATWICK_IRQ_POOL_SIZE 10 Index: 2.6-8xx/arch/ppc/platforms/gemini_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/gemini_setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/gemini_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -462,6 +462,7 @@ divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) Index: 2.6-8xx/arch/ppc/platforms/pmac_nvram.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/pmac_nvram.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/pmac_nvram.c 2005-06-16 13:45:08.000000000 -0300 @@ -80,7 +80,7 @@ static int nvram_mult, is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); extern int pmac_newworld; extern int system_running; Index: 2.6-8xx/arch/ppc/platforms/k2.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/k2.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/k2.c 2005-06-16 13:45:08.000000000 -0300 @@ -411,6 +411,7 @@ freq = k2_get_bus_speed(); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int k2_show_cpuinfo(struct seq_file *m) Index: 2.6-8xx/arch/ppc/platforms/spruce.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/spruce.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/spruce.c 2005-06-16 13:45:08.000000000 -0300 @@ -150,6 +150,7 @@ freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int 
Index: 2.6-8xx/arch/ppc/platforms/apus_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/apus_setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/apus_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -282,6 +282,7 @@ freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; Index: 2.6-8xx/arch/ppc/platforms/sbc82xx.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/sbc82xx.c 2005-06-16 13:39:47.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/sbc82xx.c 2005-06-16 13:45:08.000000000 -0300 @@ -67,7 +67,7 @@ static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: 2.6-8xx/arch/ppc/platforms/prpmc800.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/prpmc800.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/prpmc800.c 2005-06-16 13:45:08.000000000 -0300 @@ -331,6 +331,7 @@ tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -371,6 +372,7 @@ tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) Index: 2.6-8xx/arch/ppc/platforms/prep_setup.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/prep_setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 
2.6-8xx/arch/ppc/platforms/prep_setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -938,6 +938,7 @@ (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: 2.6-8xx/arch/ppc/platforms/pmac_feature.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/pmac_feature.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/pmac_feature.c 2005-06-16 13:45:08.000000000 -0300 @@ -63,7 +63,7 @@ * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -static DEFINE_SPINLOCK(feature_lock __pmacdata); +static DEFINE_RAW_SPINLOCK(feature_lock __pmacdata); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: 2.6-8xx/arch/ppc/platforms/prpmc750.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/prpmc750.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/prpmc750.c 2005-06-16 13:45:08.000000000 -0300 @@ -271,6 +271,7 @@ tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) Index: 2.6-8xx/arch/ppc/platforms/chrp_smp.c =================================================================== --- 2.6-8xx.orig/arch/ppc/platforms/chrp_smp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/platforms/chrp_smp.c 2005-06-16 13:45:08.000000000 -0300 @@ -57,7 +57,7 @@ do_openpic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit Index: 2.6-8xx/arch/ppc/lib/dec_and_lock.c =================================================================== --- 
2.6-8xx.orig/arch/ppc/lib/dec_and_lock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/lib/dec_and_lock.c 2005-06-16 13:45:08.000000000 -0300 @@ -19,7 +19,7 @@ */ #ifndef ATOMIC_DEC_AND_LOCK -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; @@ -35,12 +35,12 @@ return 0; } - spin_lock(lock); + _raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; - spin_unlock(lock); + _raw_spin_unlock(lock); return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock); #endif /* ATOMIC_DEC_AND_LOCK */ Index: 2.6-8xx/arch/ppc/lib/locks.c =================================================================== --- 2.6-8xx.orig/arch/ppc/lib/locks.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/ppc/lib/locks.c 2005-06-16 13:45:08.000000000 -0300 @@ -43,7 +43,7 @@ return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -63,9 +63,9 @@ lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -73,9 +73,9 @@ lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -89,13 +89,13 @@ wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
*/ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -115,13 +115,13 @@ return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -136,9 +136,9 @@ } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -147,9 +147,9 @@ wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -165,18 +165,18 @@ } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -185,6 +185,6 @@ wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: 2.6-8xx/arch/x86_64/mm/fault.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/mm/fault.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/mm/fault.c 2005-06-16 13:45:08.000000000 -0300 @@ -39,6 +39,7 @@ { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT @@ -326,7 +327,7 @@ return; if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); 
+ raw_local_irq_enable(); if (unlikely(page_fault_trace)) printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", Index: 2.6-8xx/arch/x86_64/Kconfig =================================================================== --- 2.6-8xx.orig/arch/x86_64/Kconfig 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/Kconfig 2005-06-16 13:45:08.000000000 -0300 @@ -34,13 +34,6 @@ config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_CALIBRATE_DELAY bool default y @@ -207,33 +200,6 @@ If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. - You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP @@ -244,6 +210,16 @@ cost of slightly increased overhead in some places. If unsure say N here. 
+source "lib/Kconfig.RT" + +config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + config K8_NUMA bool "K8 NUMA support" select NUMA Index: 2.6-8xx/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/x8664_ksyms.c 2005-06-16 13:45:08.000000000 -0300 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,6 @@ #include #include -extern spinlock_t rtc_lock; - #ifdef CONFIG_SMP extern void __write_lock_failed(rwlock_t *rw); extern void __read_lock_failed(rwlock_t *rw); @@ -62,10 +61,12 @@ EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(get_cmos_time); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(ip_compute_csum); @@ -179,7 +180,7 @@ EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock); #endif EXPORT_SYMBOL(die_chain); Index: 2.6-8xx/arch/x86_64/kernel/io_apic.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/io_apic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/io_apic.c 2005-06-16 13:45:08.000000000 -0300 @@ -45,7 +45,7 @@ static int no_timer_check; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * # of IRQ routing registers @@ -115,6 +115,9 @@ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -127,10 +130,8 @@ static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -1064,7 +1065,6 @@ void __apicdebuginit print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1218,7 +1218,7 @@ { unsigned long t1 = jiffies; - local_irq_enable(); + raw_local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -1310,11 +1310,48 @@ return 0; /* don't check for pending */ } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + * + * (In the non-preemptible case we keep the IRQ unacked in the local APIC + * and dont need to do the masking, because the code executes atomically.) 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_IO_APIC_irq(irq); +} + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { ack_APIC_irq(); } +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} +#endif /* !CONFIG_PREEMPT_HARDIRQS */ + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; @@ -1354,6 +1391,13 @@ return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); @@ -1361,6 +1405,11 @@ end_level_ioapic_irq(irq); } +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); +} + static void mask_IO_APIC_vector (unsigned int vector) { int irq = vector_to_irq(vector); Index: 2.6-8xx/arch/x86_64/kernel/apic.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/apic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/apic.c 2005-06-16 13:45:08.000000000 -0300 @@ -478,10 +478,9 @@ apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -496,7 +495,7 @@ /* XXX: Pavel needs this 
for S3 resume, but can't explain why */ set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - local_irq_save(flags); + raw_local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; @@ -519,7 +518,7 @@ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -676,7 +675,7 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* For some reasons this doesn't work on Simics, so fake it for now */ if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { @@ -706,7 +705,7 @@ __setup_APIC_LVTT(clocks); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -763,7 +762,7 @@ printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_disable(); + raw_local_irq_disable(); calibration_result = calibrate_APIC_clock(); /* @@ -771,14 +770,14 @@ */ setup_APIC_timer(calibration_result); - local_irq_enable(); + raw_local_irq_enable(); } void __init setup_secondary_APIC_clock(void) { - local_irq_disable(); /* FIXME: Do we need this? --RR */ + raw_local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); - local_irq_enable(); + raw_local_irq_enable(); } void __init disable_APIC_timer(void) Index: 2.6-8xx/arch/x86_64/kernel/semaphore.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/semaphore.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/semaphore.c 2005-06-16 13:45:08.000000000 -0300 @@ -50,12 +50,12 @@ * we cannot lose wakeup events. 
*/ -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down(struct semaphore * sem) +void __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -92,7 +92,7 @@ tsk->state = TASK_RUNNING; } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -155,7 +155,7 @@ * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +int __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; Index: 2.6-8xx/arch/x86_64/kernel/smpboot.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/smpboot.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/smpboot.c 2005-06-16 13:45:08.000000000 -0300 @@ -207,7 +207,7 @@ go[MASTER] = 0; - local_irq_save(flags); + raw_local_irq_save(flags); { for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { while (!go[MASTER]) @@ -216,7 +216,7 @@ rdtscll(go[SLAVE]); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* Index: 2.6-8xx/arch/x86_64/kernel/reboot.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/reboot.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/reboot.c 2005-06-16 13:45:08.000000000 -0300 @@ -114,12 +114,12 @@ #endif if (!reboot_force) { - local_irq_disable(); + raw_local_irq_disable(); #ifndef CONFIG_SMP disable_local_APIC(); #endif disable_IO_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* Tell the BIOS if we want cold or warm reboot */ Index: 2.6-8xx/arch/x86_64/kernel/time.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/time.c 2005-06-16 
12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -53,8 +53,8 @@ extern void i8254_timer_resume(void); extern int using_apic_timer; -DEFINE_SPINLOCK(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); static int nohpet __initdata = 0; static int notsc __initdata = 0; @@ -871,7 +871,7 @@ } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; extern void __init config_acpi_tables(void); @@ -1027,6 +1027,7 @@ write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; + touch_softlockup_watchdog(); return 0; } Index: 2.6-8xx/arch/x86_64/kernel/init_task.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/init_task.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/init_task.c 2005-06-16 13:45:08.000000000 -0300 @@ -11,7 +11,7 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: 2.6-8xx/arch/x86_64/kernel/signal.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/signal.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/signal.c 2005-06-16 13:45:08.000000000 -0300 @@ -411,6 +411,13 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain 
cases get here from Index: 2.6-8xx/arch/x86_64/kernel/nmi.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/nmi.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/nmi.c 2005-06-16 13:45:08.000000000 -0300 @@ -44,7 +44,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -128,7 +128,7 @@ static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + raw_local_irq_enable(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -157,7 +157,7 @@ for (cpu = 0; cpu < NR_CPUS; cpu++) counts[cpu] = cpu_pda[cpu].__nmi_count; - local_irq_enable(); + raw_local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { @@ -463,14 +463,46 @@ */ for (i = 0; i < NR_CPUS; i++) per_cpu(nmi_touch, i) = 1; + + touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if 
(__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -482,6 +514,11 @@ */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); Index: 2.6-8xx/arch/x86_64/kernel/entry.S =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/entry.S 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/entry.S 2005-06-16 13:45:08.000000000 -0300 @@ -211,8 +211,8 @@ /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal sti pushq %rdi call schedule @@ -231,7 +231,7 @@ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi jmp sysret_check /* Do syscall tracing */ @@ -280,8 +280,8 @@ /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful sti pushq %rdi call schedule @@ -310,7 +310,7 @@ movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -478,8 +478,8 @@ /* edi: workmask, edx: work */ retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal sti pushq %rdi call schedule @@ -499,7 +499,7 @@ call do_notify_resume RESTORE_REST cli - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -918,3 +918,40 @@ ENTRY(call_debug) zeroentry do_call_debug +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, trace_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + + leaveq +out: + ret + +#endif + Index: 2.6-8xx/arch/x86_64/kernel/process.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/process.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/process.c 2005-06-16 13:45:08.000000000 -0300 @@ -85,12 +85,13 @@ void default_idle(void) { if (!atomic_read(&hlt_counter)) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); + raw_local_irq_disable(); + if (!need_resched() && !need_resched_delayed()) + raw_safe_halt(); else - local_irq_enable(); - } + raw_local_irq_enable(); + } else + raw_local_irq_enable(); } /* @@ -102,7 +103,7 @@ { int oldval; - local_irq_enable(); + raw_local_irq_enable(); /* * 
Deal with another CPU just having chosen a thread to @@ -118,7 +119,7 @@ "rep; nop;" "je 2b;" : : - "i" (_TIF_NEED_RESCHED), + "i" (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED), "m" (current_thread_info()->flags)); } else { set_need_resched(); @@ -163,7 +164,9 @@ { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -173,10 +176,13 @@ idle = pm_idle; if (!idle) idle = default_idle; + stop_critical_timing(); + propagate_preempt_locks_value(); idle(); } - - schedule(); + raw_local_irq_disable(); + __schedule(); + raw_local_irq_enable(); } } @@ -189,16 +195,16 @@ */ static void mwait_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { set_thread_flag(TIF_POLLING_NRFLAG); do { __monitor((void *)&current_thread_info()->flags, 0, 0); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); - } while (!need_resched()); + } while (!need_resched() && !need_resched_delayed()); clear_thread_flag(TIF_POLLING_NRFLAG); } } @@ -283,7 +289,7 @@ void show_regs(struct pt_regs *regs) { __show_regs(regs); - show_trace(&regs->rsp); + show_trace(current, &regs->rsp); } /* @@ -294,13 +300,14 @@ struct task_struct *me = current; struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: 2.6-8xx/arch/x86_64/kernel/Makefile =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 
2.6-8xx/arch/x86_64/kernel/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -4,11 +4,12 @@ extra-y := head.o head64.o init_task.o vmlinux.lds EXTRA_AFLAGS := -traditional -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += semaphore.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ Index: 2.6-8xx/arch/x86_64/kernel/traps.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/traps.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/traps.c 2005-06-16 13:45:08.000000000 -0300 @@ -90,7 +90,7 @@ static inline void conditional_sti(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); + raw_local_irq_enable(); } static int kstack_depth_to_print = 10; @@ -156,7 +156,7 @@ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *task, unsigned long *stack) { unsigned long addr; const unsigned cpu = safe_smp_processor_id(); @@ -221,6 +221,7 @@ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK printk("\n"); + print_traces(task); } void show_stack(struct task_struct *tsk, unsigned long * rsp) @@ -257,7 +258,7 @@ printk("%016lx ", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, (unsigned long *)rsp); } /* @@ -266,7 +267,7 @@ void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(current, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -339,14 +340,14 @@ } #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; void 
oops_begin(void) { int cpu = safe_smp_processor_id(); /* racy, but better than risking deadlock. */ - local_irq_disable(); + raw_local_irq_disable(); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; Index: 2.6-8xx/arch/x86_64/kernel/i8259.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/i8259.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/i8259.c 2005-06-16 13:45:08.000000000 -0300 @@ -130,7 +130,7 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -440,7 +440,7 @@ * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init init_ISA_irqs (void) { Index: 2.6-8xx/arch/x86_64/kernel/smp.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/smp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/smp.c 2005-06-16 13:45:08.000000000 -0300 @@ -42,7 +42,7 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL -1ULL /* @@ -268,10 +268,20 @@ } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -349,9 +359,9 @@ * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } static void smp_really_stop_cpu(void *dummy) @@ -375,9 +385,9 @@ if (!nolock) spin_unlock(&call_lock); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: 2.6-8xx/arch/x86_64/kernel/vsyscall.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/vsyscall.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/vsyscall.c 2005-06-16 13:45:08.000000000 -0300 @@ -38,7 +38,7 @@ #define force_inline __attribute__((always_inline)) inline int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED; #include Index: 2.6-8xx/arch/x86_64/kernel/irq.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/kernel/irq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/kernel/irq.c 2005-06-16 13:45:08.000000000 -0300 @@ -43,7 +43,8 @@ } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + raw_local_irq_save(flags); + spin_lock(&irq_desc[i].lock); action = irq_desc[i].action; if (!action) goto skip; @@ -63,7 +64,8 @@ seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock(&irq_desc[i].lock); + raw_local_irq_restore(flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for (j = 0; j < NR_CPUS; j++) Index: 2.6-8xx/arch/x86_64/lib/dec_and_lock.c =================================================================== --- 
2.6-8xx.orig/arch/x86_64/lib/dec_and_lock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/lib/dec_and_lock.c 2005-06-16 13:45:08.000000000 -0300 @@ -10,7 +10,7 @@ #include #include -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; Index: 2.6-8xx/arch/x86_64/lib/thunk.S =================================================================== --- 2.6-8xx.orig/arch/x86_64/lib/thunk.S 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/lib/thunk.S 2005-06-16 13:45:08.000000000 -0300 @@ -43,11 +43,13 @@ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif thunk do_softirq_thunk,do_softirq - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. 
*/ CFI_STARTPROC Index: 2.6-8xx/arch/x86_64/ia32/sys_ia32.c =================================================================== --- 2.6-8xx.orig/arch/x86_64/ia32/sys_ia32.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/x86_64/ia32/sys_ia32.c 2005-06-16 13:45:08.000000000 -0300 @@ -455,6 +455,10 @@ struct timespec kts; struct timezone ktz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if (get_tv32(&ktv, tv)) return -EFAULT; Index: 2.6-8xx/arch/mips/Kconfig =================================================================== --- 2.6-8xx.orig/arch/mips/Kconfig 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/Kconfig 2005-06-16 13:45:08.000000000 -0300 @@ -308,6 +308,7 @@ config MOMENCO_OCELOT bool "Support for Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -660,6 +661,7 @@ depends on EXPERIMENTAL select BOOT_ELF32 select DMA_COHERENT + select NO_SPINLOCK select SWAP_IO_SPACE choice @@ -903,12 +905,21 @@ bool "FPCIB0 Backplane Support" depends on TOSHIBA_RBTX4927 +source "lib/Kconfig.RT" + config RWSEM_GENERIC_SPINLOCK bool + depends on !PREEMPT_RT default y config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT + +config ASM_SEMAPHORES + bool + depends on !PREEMPT_RT + default y config GENERIC_CALIBRATE_DELAY bool @@ -929,6 +940,9 @@ config DMA_COHERENT bool +config NO_SPINLOCK + bool + config DMA_IP27 bool @@ -1453,15 +1467,6 @@ This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. 
- config RTC_DS1742 bool "DS1742 BRAM/RTC support" depends on TOSHIBA_JMR3927 || TOSHIBA_RBTX4927 @@ -1476,10 +1481,6 @@ This will result in additional memory usage, so it is not recommended for normal users. -config RWSEM_GENERIC_SPINLOCK - bool - default y - endmenu menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" Index: 2.6-8xx/arch/mips/kernel/semaphore.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/semaphore.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/semaphore.c 2005-06-16 13:45:08.000000000 -0300 @@ -63,7 +63,7 @@ : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); Index: 2.6-8xx/arch/mips/kernel/time.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -52,7 +52,7 @@ */ extern volatile unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); /* * By default we provide the null RTC ops @@ -555,7 +555,7 @@ static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_NODELAY | SA_INTERRUPT, .name = "timer", }; Index: 2.6-8xx/arch/mips/kernel/signal.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/signal.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/signal.c 2005-06-16 13:45:08.000000000 -0300 @@ -449,6 +449,10 @@ } #endif +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. 
Just return without doing anything Index: 2.6-8xx/arch/mips/kernel/module.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/module.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/module.c 2005-06-16 13:45:08.000000000 -0300 @@ -2,7 +2,7 @@ #include static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_dbetables(unsigned long addr) Index: 2.6-8xx/arch/mips/kernel/signal32.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/signal32.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/signal32.c 2005-06-16 13:45:08.000000000 -0300 @@ -766,6 +766,10 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: 2.6-8xx/arch/mips/kernel/entry.S =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/entry.S 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/entry.S 2005-06-16 13:45:08.000000000 -0300 @@ -48,6 +48,8 @@ #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -57,12 +59,9 @@ LONG_L t0, PT_STATUS(sp) # Interrupts off? 
andi t0, 1 beqz t0, restore_all - li t0, PREEMPT_ACTIVE - sw t0, TI_PRE_COUNT($28) - local_irq_enable t0 - jal schedule - sw zero, TI_PRE_COUNT($28) local_irq_disable t0 + jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -92,6 +91,7 @@ andi t0, a2, _TIF_NEED_RESCHED beqz t0, work_notifysig work_resched: + local_irq_enable t0 jal schedule local_irq_disable t0 # make sure need_resched and Index: 2.6-8xx/arch/mips/kernel/process.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/process.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/process.c 2005-06-16 13:45:08.000000000 -0300 @@ -58,6 +58,7 @@ while (!need_resched()) if (cpu_wait) (*cpu_wait)(); + local_irq_enable(); schedule(); } } Index: 2.6-8xx/arch/mips/kernel/Makefile =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -17,6 +17,8 @@ obj-$(CONFIG_MIPS64) += module-elf64.o endif +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: 2.6-8xx/arch/mips/kernel/traps.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/traps.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/traps.c 2005-06-16 13:45:08.000000000 -0300 @@ -250,7 +250,7 @@ printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static 
DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void __die(const char * str, struct pt_regs * regs, const char * file, const char * func, unsigned long line) Index: 2.6-8xx/arch/mips/kernel/i8259.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/i8259.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/i8259.c 2005-06-16 13:45:08.000000000 -0300 @@ -31,7 +31,7 @@ * moves to arch independent land */ -spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED; +raw_spinlock_t i8259A_lock = RAW_SPIN_LOCK_UNLOCKED; static void end_8259A_irq (unsigned int irq) { Index: 2.6-8xx/arch/mips/kernel/smp.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/smp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/smp.c 2005-06-16 13:45:08.000000000 -0300 @@ -105,7 +105,22 @@ cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. 
+ */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -284,6 +299,8 @@ return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -315,6 +332,7 @@ void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1); @@ -324,6 +342,7 @@ if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -347,6 +366,8 @@ struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -360,6 +381,7 @@ if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -390,6 +412,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -402,6 +426,7 @@ if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: 2.6-8xx/arch/mips/kernel/irq.c =================================================================== --- 2.6-8xx.orig/arch/mips/kernel/irq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/kernel/irq.c 2005-06-16 13:45:08.000000000 -0300 @@ -125,7 +125,10 @@ irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].handler = &no_irq_type; - spin_lock_init(&irq_desc[i].lock); + raw_spin_lock_init(&irq_desc[i].lock); +#ifdef 
CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif } arch_init_irq(); Index: 2.6-8xx/arch/mips/math-emu/cp1emu.c =================================================================== --- 2.6-8xx.orig/arch/mips/math-emu/cp1emu.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/math-emu/cp1emu.c 2005-06-16 13:45:08.000000000 -0300 @@ -1310,7 +1310,9 @@ if (sig) break; + preempt_enable(); cond_resched(); + preempt_disable(); } while (xcp->cp0_epc > prevepc); /* SIGILL indicates a non-fpu instruction */ Index: 2.6-8xx/arch/mips/lib/dec_and_lock.c =================================================================== --- 2.6-8xx.orig/arch/mips/lib/dec_and_lock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/lib/dec_and_lock.c 2005-06-16 13:45:08.000000000 -0300 @@ -28,7 +28,7 @@ */ #ifndef ATOMIC_DEC_AND_LOCK -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; @@ -44,12 +44,12 @@ return 0; } - spin_lock(lock); + _raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; - spin_unlock(lock); + _raw_spin_unlock(lock); return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock); #endif /* ATOMIC_DEC_AND_LOCK */ Index: 2.6-8xx/arch/mips/sibyte/sb1250/time.c =================================================================== --- 2.6-8xx.orig/arch/mips/sibyte/sb1250/time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/sibyte/sb1250/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -115,10 +115,12 @@ ll_timer_interrupt(irq, regs); } - /* - * every CPU should do profiling and process accouting - */ - ll_local_timer_interrupt(irq, regs); + if (cpu != 0) { + /* + * every CPU should do profiling and process accounting + */ + ll_local_timer_interrupt(irq, regs); + } } /* Index: 2.6-8xx/arch/mips/sibyte/sb1250/irq.c =================================================================== --- 
2.6-8xx.orig/arch/mips/sibyte/sb1250/irq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/mips/sibyte/sb1250/irq.c 2005-06-16 13:45:08.000000000 -0300 @@ -88,7 +88,7 @@ /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -276,7 +276,7 @@ static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = SA_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, Index: 2.6-8xx/arch/i386/mm/highmem.c =================================================================== --- 2.6-8xx.orig/arch/i386/mm/highmem.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mm/highmem.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,6 +17,27 @@ kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,7 +46,7 @@ * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -47,7 +68,7 @@ return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -74,7 +95,7 @@ preempt_check_resched(); } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; Index: 2.6-8xx/arch/i386/mm/fault.c =================================================================== --- 2.6-8xx.orig/arch/i386/mm/fault.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mm/fault.c 2005-06-16 13:45:08.000000000 -0300 @@ -38,6 +38,8 @@ int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -211,7 +213,7 @@ * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) +fastcall notrace void do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -223,13 +225,14 @@ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + trace_special(regs->eip, error_code, address); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); + raw_local_irq_enable(); tsk = current; @@ -440,9 +443,9 @@ } #endif if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference"); else - printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(KERN_ALERT "BUG: 
Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); Index: 2.6-8xx/arch/i386/mm/pageattr.c =================================================================== --- 2.6-8xx.orig/arch/i386/mm/pageattr.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mm/pageattr.c 2005-06-16 13:45:08.000000000 -0300 @@ -206,6 +206,9 @@ { if (PageHighMem(page)) return; + if (!enable) + check_no_locks_freed(page_address(page), page_address(page+numpages)); + /* the return value is ignored - the calls cannot fail, * large pages are disabled at boot time. */ Index: 2.6-8xx/arch/i386/mm/pgtable.c =================================================================== --- 2.6-8xx.orig/arch/i386/mm/pgtable.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mm/pgtable.c 2005-06-16 13:45:08.000000000 -0300 @@ -172,7 +172,7 @@ * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: 2.6-8xx/arch/i386/boot/compressed/misc.c =================================================================== --- 2.6-8xx.orig/arch/i386/boot/compressed/misc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/boot/compressed/misc.c 2005-06-16 13:45:08.000000000 -0300 @@ -14,6 +14,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -111,7 +117,7 @@ #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; Index: 2.6-8xx/arch/i386/Kconfig =================================================================== --- 2.6-8xx.orig/arch/i386/Kconfig 
2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/Kconfig 2005-06-16 13:45:08.000000000 -0300 @@ -369,16 +369,6 @@ default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1 default "6" if MK7 || MK8 || MPENTIUMM -config RWSEM_GENERIC_SPINLOCK - bool - depends on M386 - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -435,7 +425,7 @@ config X86_USE_3DNOW bool - depends on MCYRIXIII || MK7 + depends on (MCYRIXIII || MK7) && !PREEMPT_RT default y config X86_OOSTORE @@ -511,28 +501,22 @@ cost of slightly increased overhead in some places. If unsure say N here. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT +source "lib/Kconfig.RT" + +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool +# depends on !PREEMPT_RT + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. 
config X86_UP_APIC bool "Local APIC support on uniprocessors" @@ -911,7 +895,7 @@ config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. This uses a different ABI Index: 2.6-8xx/arch/i386/kernel/io_apic.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/io_apic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/io_apic.c 2005-06-16 13:45:08.000000000 -0300 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -45,7 +46,7 @@ int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * Is the SiS APIC rmw bug present ? @@ -127,6 +128,105 @@ } } +#define IOAPIC_CACHE + +#ifdef IOAPIC_CACHE +# define MAX_IOAPIC_CACHE 512 + +/* + * Cache register values: + */ +static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE] + ____cacheline_aligned_in_smp; +#endif + +inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); +} + +unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + unsigned int val = __raw_io_apic_read(apic, reg); + +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + return val; +} + +unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + return __raw_io_apic_read(apic, reg); + } + if (io_apic_cache[apic][reg] && !sis_apic_bug) + return io_apic_cache[apic][reg]; +#endif + return raw_io_apic_read(apic, reg); +} + +void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= 
MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + } else + io_apic_cache[apic][reg] = val; +#endif + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +} + +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + */ +#ifdef CONFIG_SMP +# define IOAPIC_POSTFLUSH +#endif + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + if (unlikely(sis_apic_bug)) + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +#ifndef IOAPIC_POSTFLUSH + if (unlikely(sis_apic_bug)) +#endif + /* + * Force POST flush by reading: + */ + val = *(IO_APIC_BASE(apic)+4); +} + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; @@ -158,18 +258,6 @@ __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1405,8 +1493,8 @@ struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = 
raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1565,7 +1653,6 @@ void /*__init*/ print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1767,7 +1854,7 @@ { unsigned long t1 = jiffies; - local_irq_enable(); + raw_local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -1858,6 +1945,31 @@ return 0; /* don't check for pending */ } +/* + * Level-triggered interrupt handling is different for RT kernels. + * + * In the RT case mask the IRQ first, then ack it, redirect it, + * and the IRQ thread then will handle it (sometime later) and will + * unmask it. + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { unsigned long v; @@ -1885,17 +1997,30 @@ */ i = IO_APIC_VECTOR(irq); - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - +//#define APIC_BUG +#ifdef APIC_BUG + v = raw_apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif ack_APIC_irq(); +#ifdef APIC_BUG if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } +#endif +} + +#endif /* !CONFIG_PREEMPT_HARDIRQS */ + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); } #ifdef CONFIG_PCI_MSI @@ -1920,11 +2045,27 @@ return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + 
mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); - end_level_ioapic_irq(irq); +#ifdef CONFIG_PREEMPT_HARDIRQS + if (!(irq_desc[vector].status & (IRQ_DISABLED | IRQ_INPROGRESS)) && + irq_desc[vector].action) +#endif + end_level_ioapic_irq(irq); +} + +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); } static void mask_IO_APIC_vector (unsigned int vector) @@ -2157,7 +2298,10 @@ */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = 1; +#ifdef CONFIG_PREEMPT_RT + if (nmi_watchdog) +#endif + timer_ack = 1; enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); Index: 2.6-8xx/arch/i386/kernel/apic.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/apic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/apic.c 2005-06-16 13:45:08.000000000 -0300 @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -523,9 +524,9 @@ if (!cpu_has_apic || !enabled_via_apicbase) return; - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } #ifdef CONFIG_PM @@ -569,9 +570,9 @@ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_irq_save(flags); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -583,7 +584,7 @@ if (!apic_pm_state.active) return 0; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Make sure the APICBASE points to the right address @@ -614,7 +615,7 @@ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -857,7 +858,6 @@ */ static unsigned int __init get_8254_timer_count(void) { - extern 
spinlock_t i8253_lock; unsigned long flags; unsigned int count; @@ -934,7 +934,7 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Wait for IRQ0's slice: @@ -943,7 +943,7 @@ __setup_APIC_LVTT(clocks); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -1032,7 +1032,7 @@ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_disable(); + raw_local_irq_disable(); calibration_result = calibrate_APIC_clock(); /* @@ -1040,7 +1040,7 @@ */ setup_APIC_timer(calibration_result); - local_irq_enable(); + raw_local_irq_enable(); } void __init setup_secondary_APIC_clock(void) @@ -1113,6 +1113,7 @@ int cpu = smp_processor_id(); profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { /* * The multiplier may have changed since the last time we got @@ -1158,7 +1159,7 @@ * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); @@ -1167,6 +1168,8 @@ */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. 
Index: 2.6-8xx/arch/i386/kernel/i386_ksyms.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/i386_ksyms.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/i386_ksyms.c 2005-06-16 13:45:08.000000000 -0300 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,6 @@ #include extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); @@ -81,10 +81,12 @@ EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ @@ -140,8 +142,10 @@ EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); EXPORT_SYMBOL(cpu_callout_map); +#ifdef CONFIG_ASM_SEMAPHORES EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); +#endif /* Global SMP stuff */ EXPORT_SYMBOL(smp_call_function); @@ -171,17 +175,19 @@ EXPORT_SYMBOL(register_die_notifier); #ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock); #endif EXPORT_SYMBOL(__PAGE_KERNEL); #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_to_page); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic_to_page); #endif #if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) Index: 2.6-8xx/arch/i386/kernel/semaphore.c 
=================================================================== --- 2.6-8xx.orig/arch/i386/kernel/semaphore.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/semaphore.c 2005-06-16 13:45:08.000000000 -0300 @@ -16,6 +16,7 @@ #include #include #include +#include #include /* @@ -49,12 +50,12 @@ * we cannot lose wakeup events. */ -static fastcall void __attribute_used__ __up(struct semaphore *sem) +static fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -static fastcall void __attribute_used__ __sched __down(struct semaphore * sem) +static fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ tsk->state = TASK_RUNNING; } -static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem) +static fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -154,7 +155,7 @@ * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. 
*/ -static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) +static fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -190,15 +191,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -211,15 +212,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -232,15 +233,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -253,45 +254,20 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - 
LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: 2.6-8xx/arch/i386/kernel/smpboot.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/smpboot.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/smpboot.c 2005-06-16 13:45:08.000000000 -0300 @@ -440,7 +440,7 @@ cpu_set(smp_processor_id(), cpu_online_map); /* We can take interrupts now: we're officially "up". */ - local_irq_enable(); + raw_local_irq_enable(); wmb(); cpu_idle(); @@ -1120,17 +1120,17 @@ { /* This only works at boot for x86. See "rewrite" above. */ if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); + raw_local_irq_enable(); return -ENOSYS; } /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { - local_irq_enable(); + raw_local_irq_enable(); return -EIO; } - local_irq_enable(); + raw_local_irq_enable(); /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) Index: 2.6-8xx/arch/i386/kernel/reboot.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/reboot.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/reboot.c 2005-06-16 13:45:08.000000000 -0300 @@ -218,7 +218,7 @@ { unsigned long flags; - local_irq_disable(); + raw_local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST routine will recognize as telling it to do a proper reboot. (Well Index: 2.6-8xx/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/timers/timer_hpet.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/timers/timer_hpet.c 2005-06-16 13:45:08.000000000 -0300 @@ -24,7 +24,7 @@ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* convert from cycles(64bits) => nanoseconds (64bits) * basic equation: Index: 2.6-8xx/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/timers/timer_tsc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/timers/timer_tsc.c 2005-06-16 13:45:08.000000000 -0300 @@ -24,6 +24,7 @@ #include "mach_timer.h" #include +#include #ifdef CONFIG_HPET_TIMER static unsigned long hpet_usec_quotient; @@ -35,8 +36,6 @@ int tsc_disable __initdata = 0; -extern spinlock_t i8253_lock; - static int use_tsc; /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -44,7 +43,7 @@ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits 
of Time Stamp Counter */ static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* convert from cycles(64bits) => nanoseconds (64bits) * basic equation: @@ -171,9 +170,9 @@ static void mark_offset_tsc_hpet(void) { unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; + unsigned long offset, temp, hpet_current, flags; - write_seqlock(&monotonic_lock); + write_seqlock_irqsave(&monotonic_lock, flags); last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; /* * It is important that these two operations happen almost at @@ -201,7 +200,7 @@ /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); + write_sequnlock_irqrestore(&monotonic_lock, flags); /* calculate delay_at_last_interrupt */ /* @@ -342,7 +341,10 @@ static void mark_offset_tsc(void) { - unsigned long lost,delay; + unsigned long lost,delay, flags; +#ifndef CONFIG_PREEMPT_RT + unsigned long flags2; +#endif unsigned long delta = last_tsc_low; int count; int countmp; @@ -350,7 +352,7 @@ unsigned long long this_offset, last_offset; static int lost_count = 0; - write_seqlock(&monotonic_lock); + write_seqlock_irqsave(&monotonic_lock, flags); last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; /* * It is important that these two operations happen almost at @@ -368,24 +370,34 @@ rdtsc(last_tsc_low, last_tsc_high); - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ +#ifdef CONFIG_PREEMPT_RT + /* + * On PREEMPT_RT the timer IRQ never gets delayed by + * other interrupts, so we dont have to read the + * count: + */ + count = LATCH - 2; +#else + spin_lock_irqsave(&i8253_lock, flags2); + outb(0x00, PIT_MODE); /* latch the count ASAP */ + count = 
inb(PIT_CH0); /* read the latched count */ count |= inb(PIT_CH0) << 8; +# undef VIA686A_WORKAROUND /* * VIA686a test code... reset the latch if count > max + 1 * from timer_pit.c - cjb */ +# ifdef VIA686A_WORKAROUND if (count > LATCH) { outb_p(0x34, PIT_MODE); outb_p(LATCH & 0xff, PIT_CH0); outb(LATCH >> 8, PIT_CH0); count = LATCH - 1; } - - spin_unlock(&i8253_lock); +# endif + spin_unlock_irqrestore(&i8253_lock, flags2); +#endif /* PREEMPT_RT */ if (pit_latch_buggy) { /* get center value of last 3 time lutch */ @@ -401,6 +413,11 @@ } } + if (panic_timeout > 0) { + panic_timeout--; + printk("LATCH: %d, count: %d, count1: %d, count2: %d\n", LATCH, count, count1, count2); + } + /* lost tick compensation */ delta = last_tsc_low - delta; { @@ -438,7 +455,7 @@ /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); + write_sequnlock_irqrestore(&monotonic_lock, flags); /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * TICK_SIZE; Index: 2.6-8xx/arch/i386/kernel/timers/timer_pm.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/timers/timer_pm.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/timers/timer_pm.c 2005-06-16 13:45:08.000000000 -0300 @@ -41,7 +41,7 @@ static u32 offset_delay; static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ Index: 2.6-8xx/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/timers/timer_cyclone.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/timers/timer_cyclone.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,9 +17,9 @@ #include #include #include -#include 
"io_ports.h" +#include -extern spinlock_t i8253_lock; +#include "io_ports.h" /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -36,7 +36,7 @@ static u32 last_cyclone_low; static u32 last_cyclone_high; static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* helper macro to atomically read both cyclone counter registers */ #define read_cyclone_counter(low,high) \ Index: 2.6-8xx/arch/i386/kernel/timers/timer_pit.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/timers/timer_pit.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/timers/timer_pit.c 2005-06-16 13:45:08.000000000 -0300 @@ -15,9 +15,8 @@ #include #include #include +#include -extern spinlock_t i8259A_lock; -extern spinlock_t i8253_lock; #include "do_timer.h" #include "io_ports.h" @@ -166,7 +165,6 @@ void setup_pit_timer(void) { - extern spinlock_t i8253_lock; unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); Index: 2.6-8xx/arch/i386/kernel/time.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/time.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/time.c 2005-06-16 13:45:08.000000000 -0300 @@ -68,7 +68,8 @@ #include "io_ports.h" -extern spinlock_t i8259A_lock; +#include + int pit_latch_buggy; /* extern */ #include "do_timer.h" @@ -81,9 +82,11 @@ extern unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); + +#include -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); struct timer_opts *cur_timer = &timer_none; @@ -231,7 +234,7 @@ EXPORT_SYMBOL(monotonic_clock); #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = 
instruction_pointer(regs); @@ -252,17 +255,18 @@ { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { + unsigned long flags; /* * Subtle, when I/O APICs are used we have to ack timer IRQ * manually to reset the IRR bit for do_slow_gettimeoffset(). * This will also deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + spin_lock_irqsave(&i8259A_lock, flags); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + spin_unlock_irqrestore(&i8259A_lock, flags); } #endif @@ -305,6 +309,7 @@ do_timer_interrupt(irq, NULL, regs); write_sequnlock(&xtime_lock); + return IRQ_HANDLED; } @@ -326,8 +331,7 @@ } static void sync_cmos_clock(unsigned long dummy); -static struct timer_list sync_cmos_timer = - TIMER_INITIALIZER(sync_cmos_clock, 0, 0); +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); static void sync_cmos_clock(unsigned long dummy) { @@ -405,6 +409,7 @@ write_sequnlock_irqrestore(&xtime_lock, flags); jiffies += sleep_length; wall_jiffies += sleep_length; + touch_softlockup_watchdog(); return 0; } Index: 2.6-8xx/arch/i386/kernel/init_task.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/init_task.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/init_task.c 2005-06-16 13:45:08.000000000 -0300 @@ -11,7 +11,7 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: 2.6-8xx/arch/i386/kernel/apm.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/apm.c 2005-06-16 12:51:35.000000000 -0300 +++ 
2.6-8xx/arch/i386/kernel/apm.c 2005-06-16 13:45:08.000000000 -0300 @@ -228,10 +228,10 @@ #include #include #include +#include #include "io_ports.h" -extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); @@ -552,9 +552,9 @@ */ #define APM_DO_CLI \ if (apm_info.allow_ints) \ - local_irq_enable(); \ + raw_local_irq_enable(); \ else \ - local_irq_disable(); + raw_local_irq_disable(); #ifdef APM_ZERO_SEGS # define APM_DECL_SEGS \ @@ -609,7 +609,7 @@ APM_DO_SAVE_SEGS; apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -652,7 +652,7 @@ APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); __get_cpu_var(cpu_gdt_table)[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -1168,8 +1168,7 @@ static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND - unsigned long flags; - extern spinlock_t i8253_lock; + unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); /* set the clock to 100 Hz */ @@ -1202,7 +1201,7 @@ } device_suspend(PMSG_SUSPEND); - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ @@ -1218,13 +1217,13 @@ */ spin_unlock(&i8253_lock); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); save_processor_state(); err = set_system_power_state(APM_STATE_SUSPEND); restore_processor_state(); - local_irq_disable(); + raw_local_irq_disable(); write_seqlock(&xtime_lock); spin_lock(&i8253_lock); reinit_timer(); @@ -1240,7 +1239,7 @@ apm_error("suspend", err); err = (err == APM_SUCCESS) ? 
0 : -EIO; device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(APM_NORMAL_RESUME, NULL); @@ -1259,22 +1258,22 @@ { int err; - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ write_seqlock(&xtime_lock); /* If needed, notify drivers here */ get_time_diff(); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) apm_error("standby", err); - local_irq_disable(); + raw_local_irq_disable(); device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); } static apm_event_t get_event(void) Index: 2.6-8xx/arch/i386/kernel/cpu/mtrr/main.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/cpu/mtrr/main.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/cpu/mtrr/main.c 2005-06-16 13:45:08.000000000 -0300 @@ -146,7 +146,7 @@ struct set_mtrr_data *data = info; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); atomic_dec(&data->count); while(!atomic_read(&data->gate)) @@ -164,7 +164,7 @@ cpu_relax(); atomic_dec(&data->count); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif @@ -225,7 +225,7 @@ if (smp_call_function(ipi_handler, &data, 1, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); - local_irq_save(flags); + raw_local_irq_save(flags); while(atomic_read(&data.count)) cpu_relax(); @@ -259,7 +259,7 @@ while(atomic_read(&data.count)) cpu_relax(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /** Index: 2.6-8xx/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/cpu/mtrr/generic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/cpu/mtrr/generic.c 2005-06-16 
13:45:08.000000000 -0300 @@ -242,7 +242,7 @@ static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they @@ -304,14 +304,14 @@ unsigned long mask, count; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); /* Actually set the state */ mask = set_mtrr_state(deftype_lo,deftype_hi); post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { @@ -336,7 +336,7 @@ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); if (size == 0) { @@ -351,7 +351,7 @@ } post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) Index: 2.6-8xx/arch/i386/kernel/cpu/mtrr/cyrix.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/cpu/mtrr/cyrix.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/cpu/mtrr/cyrix.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,7 +17,7 @@ arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ /* Save flags and disable interrupts */ - local_irq_save(flags); + raw_local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -28,7 +28,7 @@ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ /* Enable interrupts if it was enabled previously */ - local_irq_restore(flags); + raw_local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; Index: 2.6-8xx/arch/i386/kernel/cpu/mtrr/state.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/cpu/mtrr/state.c 2005-06-16 12:51:35.000000000 -0300 +++ 
2.6-8xx/arch/i386/kernel/cpu/mtrr/state.c 2005-06-16 13:45:08.000000000 -0300 @@ -12,7 +12,7 @@ unsigned int cr0; /* Disable interrupts locally */ - local_irq_save(ctxt->flags); + raw_local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { @@ -73,6 +73,6 @@ write_cr4(ctxt->cr4val); } /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); + raw_local_irq_restore(ctxt->flags); } Index: 2.6-8xx/arch/i386/kernel/signal.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/signal.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/signal.c 2005-06-16 13:45:08.000000000 -0300 @@ -593,6 +593,13 @@ int signr; struct k_sigaction ka; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: 2.6-8xx/arch/i386/kernel/nmi.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/nmi.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/nmi.c 2005-06-16 13:45:08.000000000 -0300 @@ -36,7 +36,7 @@ unsigned int nmi_watchdog = NMI_NONE; extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -114,8 +114,8 @@ for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; - local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + raw_local_irq_enable(); + mdelay((100*1000)/nmi_hz); // wait 100 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_SMP @@ -136,7 +136,7 @@ /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a 
difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 1000; return 0; } @@ -339,8 +339,8 @@ | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); + Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz*1000/nmi_hz), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); @@ -361,8 +361,8 @@ | P6_NMI_EVENT; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0); + Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_P6_PERFCTR0, -(cpu_khz*1000/nmi_hz), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); @@ -402,8 +402,8 @@ wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz*1000/nmi_hz), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; @@ -472,11 +472,38 @@ */ for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); } extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; 
+ for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +void notrace nmi_watchdog_tick (struct pt_regs * regs) { /* @@ -488,14 +515,38 @@ sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } + if (last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -518,7 +569,7 @@ * other P6 variant */ apic_write(APIC_LVTPC, APIC_DM_NMI); } - wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + wrmsr(nmi_perfctr_msr, -(cpu_khz*1000/nmi_hz), -1); } } Index: 2.6-8xx/arch/i386/kernel/entry.S =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/entry.S 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/entry.S 2005-06-16 13:45:08.000000000 -0300 @@ -76,10 +76,10 @@ VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop cli +# define preempt_stop cli #else -#define preempt_stop -#define resume_kernel restore_nocheck +# define preempt_stop +# define resume_kernel restore_nocheck #endif #define SAVE_ALL \ @@ -160,14 +160,17 @@ #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck 
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + cli call preempt_schedule_irq jmp need_resched #endif @@ -200,6 +203,11 @@ pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -213,6 +221,11 @@ movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -225,6 +238,11 @@ ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -254,6 +272,17 @@ cmpl $((4 << 8) | 3), %eax je ldt_ss # returning to user-space with LDT SS restore_nocheck: +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE) + pushl %eax +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + call trace_irqs_on +#endif +#ifdef CONFIG_LATENCY_TRACE + call sys_ret +#endif + popl %eax +#endif +restore_nocheck_nmi: RESTORE_REGS addl $4, %esp 1: iret @@ -297,18 +326,22 @@ # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testb $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %cl jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli 
+ call __schedule +#ifdef CONFIG_PREEMPT_RT + call local_irq_enable_noresched +#endif + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testb $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %cl jnz work_resched work_notifysig: # deal with pending signals and @@ -348,6 +381,11 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + pushl %eax + call trace_irqs_on + popl %eax +#endif sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -409,9 +447,16 @@ vector=vector+1 .endr +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define TRACE_IRQS_OFF call trace_irqs_off_lowlevel; +#else +# define TRACE_IRQS_OFF +#endif + ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -420,6 +465,7 @@ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; @@ -549,7 +595,7 @@ xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_nmi nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) Index: 2.6-8xx/arch/i386/kernel/process.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/process.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/process.c 2005-06-16 13:45:08.000000000 -0300 @@ -96,12 +96,13 @@ void default_idle(void) { if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); + raw_local_irq_disable(); + if (!need_resched() && !need_resched_delayed()) + raw_safe_halt(); else - local_irq_enable(); + raw_local_irq_enable(); } else { + raw_local_irq_enable(); 
cpu_relax(); } } @@ -115,7 +116,7 @@ { int oldval; - local_irq_enable(); + raw_local_irq_enable(); /* * Deal with another CPU just having chosen a thread to @@ -130,7 +131,7 @@ "testl %0, %1;" "rep; nop;" "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + : : "i"(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), "m" (current_thread_info()->flags)); clear_thread_flag(TIF_POLLING_NRFLAG); } else { @@ -148,7 +149,9 @@ { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -161,9 +164,13 @@ idle = default_idle; __get_cpu_var(irq_stat).idle_timestamp = jiffies; + stop_critical_timing(); + propagate_preempt_locks_value(); idle(); } - schedule(); + raw_local_irq_disable(); + __schedule(); + raw_local_irq_enable(); } } @@ -206,14 +213,14 @@ { local_irq_enable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { set_thread_flag(TIF_POLLING_NRFLAG); do { __monitor((void *)&current_thread_info()->flags, 0, 0); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); - } while (!need_resched()); + } while (!need_resched() && !need_resched_delayed()); clear_thread_flag(TIF_POLLING_NRFLAG); } } @@ -336,11 +343,16 @@ /* The process may have allocated an io port bitmap... nuke it. 
*/ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + int cpu; + struct tss_struct *tss; + void *io_bitmap_ptr = t->io_bitmap_ptr; - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + mb(); + kfree(io_bitmap_ptr); + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); /* * Careful, clear this in the TSS too: */ Index: 2.6-8xx/arch/i386/kernel/Makefile =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -4,11 +4,12 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ +obj-y := process.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ doublefault.o quirks.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += cpu/ obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ @@ -20,6 +21,7 @@ obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o Index: 2.6-8xx/arch/i386/kernel/traps.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/traps.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/traps.c 2005-06-16 13:45:08.000000000 -0300 @@ -93,7 +93,7 @@ static int kstack_depth_to_print = 24; struct notifier_block *i386die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); +static DEFINE_RAW_SPINLOCK(die_notifier_lock); int register_die_notifier(struct notifier_block *nb) { @@ -115,22 +115,27 @@ unsigned long *stack, unsigned long ebp) { unsigned long addr; +#ifndef CONFIG_FRAME_POINTER + 
unsigned long prev_frame; +#endif -#ifdef CONFIG_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); printk(" [<%08lx>] ", addr); print_symbol("%s", addr); - printk("\n"); + printk(" (%ld)\n", *(unsigned long *)ebp - ebp); ebp = *(unsigned long *)ebp; } #else + prev_frame = (unsigned long)stack; while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) { printk(" [<%08lx>]", addr); print_symbol(" %s", addr); - printk("\n"); + printk(" (%ld)\n", (unsigned long)stack - prev_frame); + prev_frame = (unsigned long)stack; } } #endif @@ -162,6 +167,7 @@ break; printk(" =======================\n"); } + print_traces(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -224,8 +230,8 @@ regs->eax, regs->ebx, regs->ecx, regs->edx); printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk("ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk("ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); printk("Process %s (pid: %d, threadinfo=%p task=%p)", current->comm, current->pid, current_thread_info(), current); /* @@ -296,11 +302,11 @@ void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -369,6 +375,11 @@ if (!(regs->xcs & 3)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { struct task_struct *tsk = current; tsk->thread.error_code = error_code; @@ -497,7 +508,7 @@ return; gp_in_vm86: - local_irq_enable(); + raw_local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; @@ -551,10 +562,11 @@ 
printk("Do you have a strange power saving mode enabled?\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); +static DEFINE_RAW_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + deadlock_trace_off(); spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -569,17 +581,19 @@ console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; /* Only the BSP gets external NMIs from the system. */ if (!smp_processor_id()) reason = get_nmi_reason(); - + +// trace_special(6, 0, 0); if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) @@ -591,6 +605,7 @@ */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif @@ -610,18 +625,19 @@ reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); ++nmi_count(cpu); @@ -689,7 +705,7 @@ return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); + raw_local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { Index: 2.6-8xx/arch/i386/kernel/i8259.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/i8259.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/i8259.c 2005-06-16 13:45:08.000000000 -0300 @@ -38,13 +38,15 @@ * moves 
to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { +#ifndef CONFIG_PREEMPT_HARDIRQS if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && irq_desc[irq].action) enable_8259A_irq(irq); +#endif } #define shutdown_8259A_irq disable_8259A_irq @@ -191,6 +193,11 @@ */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; +#ifdef CONFIG_PREEMPT_HARDIRQS +handle_real_irq: + if (irq & 8) + outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ +#else cached_irq_mask |= irqmask; handle_real_irq: @@ -204,6 +211,7 @@ outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ } +#endif spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -304,15 +312,18 @@ outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ +#ifndef CONFIG_PREEMPT_HARDIRQS + if (!auto_eoi) /* master expects normal EOI */ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ +#endif + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#ifndef CONFIG_PREEMPT_HARDIRQS if (auto_eoi) /* * in AEOI mode we just have to mask the interrupt @@ -320,6 +331,7 @@ */ i8259A_irq_type.ack = disable_8259A_irq; else +#endif i8259A_irq_type.ack = mask_and_ack_8259A; udelay(100); /* wait for 8259A 
to initialize */ @@ -357,7 +369,7 @@ * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { Index: 2.6-8xx/arch/i386/kernel/smp.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/smp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/smp.c 2005-06-16 13:45:08.000000000 -0300 @@ -162,7 +162,7 @@ unsigned long cfg; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Wait for idle. @@ -185,7 +185,7 @@ */ apic_write_around(APIC_ICR, cfg); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) @@ -199,7 +199,7 @@ * should be modified to do 1 message per cluster ID - mbligh */ - local_irq_save(flags); + raw_local_irq_save(flags); for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { @@ -226,7 +226,7 @@ apic_write_around(APIC_ICR, cfg); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); } #include /* must come after the send_IPI functions above for inlining */ @@ -244,7 +244,7 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -389,7 +389,7 @@ while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -478,10 +478,20 @@ } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -520,7 +530,7 @@ return 0; /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); data.func = func; data.info = info; @@ -554,7 +564,7 @@ * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) __asm__("hlt"); @@ -569,19 +579,20 @@ { smp_call_function(stop_this_cpu, NULL, 1, 0); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: 2.6-8xx/arch/i386/kernel/irq.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/irq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/irq.c 2005-06-16 13:45:08.000000000 -0300 @@ -48,7 +48,7 @@ * SMP cross-CPU interrupts have their own specific * handlers). 
*/ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -58,6 +58,7 @@ #endif irq_enter(); + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -66,7 +67,7 @@ __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } @@ -234,6 +235,7 @@ for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); + seq_printf(p, " %d/%d", irq_desc[i].irqs_unhandled, irq_desc[i].irq_count); seq_putc(p, '\n'); skip: Index: 2.6-8xx/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/arch/i386/kernel/mcount-wrapper.S 2005-06-16 13:45:08.000000000 -0300 @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: 2.6-8xx/arch/i386/kernel/microcode.c =================================================================== --- 2.6-8xx.orig/arch/i386/kernel/microcode.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/kernel/microcode.c 2005-06-16 13:45:08.000000000 -0300 @@ -109,7 +109,7 @@ #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent 
->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); Index: 2.6-8xx/arch/i386/mach-voyager/voyager_basic.c =================================================================== --- 2.6-8xx.orig/arch/i386/mach-voyager/voyager_basic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mach-voyager/voyager_basic.c 2005-06-16 13:45:08.000000000 -0300 @@ -30,6 +30,7 @@ #include #include #include +#include /* * Power off function, if any @@ -182,7 +183,6 @@ * and swiftly introduce it to something sharp and * pointy. */ __u16 val; - extern spinlock_t i8253_lock; spin_lock(&i8253_lock); Index: 2.6-8xx/arch/i386/mach-voyager/setup.c =================================================================== --- 2.6-8xx.orig/arch/i386/mach-voyager/setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mach-voyager/setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -17,7 +17,7 @@ /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: 2.6-8xx/arch/i386/lib/dec_and_lock.c =================================================================== --- 2.6-8xx.orig/arch/i386/lib/dec_and_lock.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/lib/dec_and_lock.c 2005-06-16 13:45:08.000000000 -0300 @@ -10,7 +10,7 @@ #include #include -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; @@ -32,9 +32,9 @@ return 0; slow_path: - 
spin_lock(lock); + _raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; - spin_unlock(lock); + _raw_spin_unlock(lock); return 0; } Index: 2.6-8xx/arch/i386/lib/bitops.c =================================================================== --- 2.6-8xx.orig/arch/i386/lib/bitops.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/lib/bitops.c 2005-06-16 13:45:08.000000000 -0300 @@ -68,3 +68,37 @@ return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#if defined(CONFIG_SMP) +asm( +".section .sched.text\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret" +); + +asm( +".section .sched.text\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + LOCK "incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + LOCK "decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret" +); +#endif Index: 2.6-8xx/arch/i386/mach-visws/visws_apic.c =================================================================== --- 2.6-8xx.orig/arch/i386/mach-visws/visws_apic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mach-visws/visws_apic.c 2005-06-16 13:45:08.000000000 -0300 @@ -261,11 +261,13 @@ static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = SA_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = SA_NODELAY, }; Index: 2.6-8xx/arch/i386/mach-visws/setup.c =================================================================== --- 2.6-8xx.orig/arch/i386/mach-visws/setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mach-visws/setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -112,7 +112,7 @@ static struct irqaction irq0 = { .handler = 
timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .name = "timer", }; Index: 2.6-8xx/arch/i386/mach-default/setup.c =================================================================== --- 2.6-8xx.orig/arch/i386/mach-default/setup.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/i386/mach-default/setup.c 2005-06-16 13:45:08.000000000 -0300 @@ -27,7 +27,7 @@ /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -71,7 +71,7 @@ { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; /** * time_init_hook - do any specific initialisations for the system timer. 
Index: 2.6-8xx/arch/m68k/mac/macboing.c =================================================================== --- 2.6-8xx.orig/arch/m68k/mac/macboing.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/m68k/mac/macboing.c 2005-06-16 13:45:08.000000000 -0300 @@ -56,8 +56,7 @@ /* * our timer to start/continue/stop the bell */ -static struct timer_list mac_sound_timer = - TIMER_INITIALIZER(mac_nosound, 0, 0); +static DEFINE_TIMER(mac_sound_timer, mac_nosound, 0, 0); /* * Sort of initialize the sound chip (called from mac_mksound on the first Index: 2.6-8xx/arch/m68k/amiga/amisound.c =================================================================== --- 2.6-8xx.orig/arch/m68k/amiga/amisound.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/arch/m68k/amiga/amisound.c 2005-06-16 13:45:08.000000000 -0300 @@ -63,7 +63,7 @@ } static void nosound( unsigned long ignored ); -static struct timer_list sound_timer = TIMER_INITIALIZER(nosound, 0, 0); +static DEFINE_TIMER(sound_timer, nosound, 0, 0); void amiga_mksound( unsigned int hz, unsigned int ticks ) { Index: 2.6-8xx/drivers/net/atari_bionet.c =================================================================== --- 2.6-8xx.orig/drivers/net/atari_bionet.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/atari_bionet.c 2005-06-16 13:45:08.000000000 -0300 @@ -155,7 +155,7 @@ static struct net_device_stats *net_get_stats(struct net_device *dev); static void bionet_tick(unsigned long); -static struct timer_list bionet_timer = TIMER_INITIALIZER(bionet_tick, 0, 0); +static DEFINE_TIMER(bionet_timer, bionet_tick, 0, 0); #define STRAM_ADDR(a) (((a) & 0xff000000) == 0) Index: 2.6-8xx/drivers/net/tg3.c =================================================================== --- 2.6-8xx.orig/drivers/net/tg3.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/tg3.c 2005-06-16 13:45:08.000000000 -0300 @@ -3247,9 +3247,9 @@ * So we really do need to disable interrupts when taking * tx_lock here. 
*/ - local_irq_save(flags); + local_irq_save_nort(flags); if (!spin_trylock(&tp->tx_lock)) { - local_irq_restore(flags); + local_irq_restore_nort(flags); return NETDEV_TX_LOCKED; } Index: 2.6-8xx/drivers/net/tulip/tulip_core.c =================================================================== --- 2.6-8xx.orig/drivers/net/tulip/tulip_core.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/tulip/tulip_core.c 2005-06-16 13:45:08.000000000 -0300 @@ -1802,6 +1802,7 @@ pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: 2.6-8xx/drivers/net/atari_pamsnet.c =================================================================== --- 2.6-8xx.orig/drivers/net/atari_pamsnet.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/atari_pamsnet.c 2005-06-16 13:45:08.000000000 -0300 @@ -165,7 +165,7 @@ static irqreturn_t pamsnet_intr(int irq, void *data, struct pt_regs *fp); -static struct timer_list pamsnet_timer = TIMER_INITIALIZER(pamsnet_tick, 0, 0); +static DEFINE_TIMER(pamsnet_timer, pamsnet_tick, 0, 0); #define STRAM_ADDR(a) (((a) & 0xff000000) == 0) Index: 2.6-8xx/drivers/net/mv643xx_eth.c =================================================================== --- 2.6-8xx.orig/drivers/net/mv643xx_eth.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/mv643xx_eth.c 2005-06-16 13:45:08.000000000 -0300 @@ -95,7 +95,7 @@ static void __iomem *mv643xx_eth_shared_base; /* used to protect MV643XX_ETH_SMI_REG, which is shared across ports */ -static spinlock_t mv643xx_eth_phy_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(mv643xx_eth_phy_lock); static inline u32 mv_read(int offset) { Index: 2.6-8xx/drivers/net/hamradio/yam.c =================================================================== --- 2.6-8xx.orig/drivers/net/hamradio/yam.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/hamradio/yam.c 2005-06-16 
13:45:08.000000000 -0300 @@ -170,7 +170,7 @@ static char ax25_test[7] = {'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, '1' << 1}; -static struct timer_list yam_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(yam_timer, NULL, 0, 0); /* --------------------------------------------------------------------- */ Index: 2.6-8xx/drivers/net/ppp_synctty.c =================================================================== --- 2.6-8xx.orig/drivers/net/ppp_synctty.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/ppp_synctty.c 2005-06-16 13:45:08.000000000 -0300 @@ -70,7 +70,7 @@ struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: 2.6-8xx/drivers/net/e1000/e1000_main.c =================================================================== --- 2.6-8xx.orig/drivers/net/e1000/e1000_main.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/e1000/e1000_main.c 2005-06-16 13:45:08.000000000 -0300 @@ -2270,10 +2270,10 @@ if(adapter->pcix_82544) count += nr_frags; - local_irq_save(flags); + local_irq_save_nort(flags); if (!spin_trylock(&adapter->tx_lock)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); + local_irq_restore_nort(flags); return NETDEV_TX_LOCKED; } if(adapter->hw.tx_pkt_filtering && (adapter->hw.mac_type == e1000_82573) ) Index: 2.6-8xx/drivers/net/3c59x.c =================================================================== --- 2.6-8xx.orig/drivers/net/3c59x.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/3c59x.c 2005-06-16 13:45:08.000000000 -0300 @@ -956,9 +956,13 @@ struct vortex_private *vp = netdev_priv(dev); unsigned long flags; local_save_flags(flags); +#ifndef CONFIG_PREEMPT_RT local_irq_disable(); +#endif (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } #endif @@ -1987,13 +1991,17 @@ /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: 2.6-8xx/drivers/net/cris/eth_v10.c =================================================================== --- 2.6-8xx.orig/drivers/net/cris/eth_v10.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/cris/eth_v10.c 2005-06-16 13:45:08.000000000 -0300 @@ -384,8 +384,8 @@ static unsigned int network_tr_ctrl_shadow = 0; /* Network speed indication. */ -static struct timer_list speed_timer = TIMER_INITIALIZER(NULL, 0, 0); -static struct timer_list clear_led_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(speed_timer, NULL, 0, 0); +static DEFINE_TIMER(clear_led_timer, NULL, 0, 0); static int current_speed; /* Speed read from transceiver */ static int current_speed_selection; /* Speed selected by user */ static unsigned long led_next_time; @@ -393,7 +393,7 @@ static int rx_queue_len; /* Duplex */ -static struct timer_list duplex_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(duplex_timer, NULL, 0, 0); static int full_duplex; static enum duplex current_duplex; Index: 2.6-8xx/drivers/net/netconsole.c =================================================================== --- 2.6-8xx.orig/drivers/net/netconsole.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/netconsole.c 2005-06-16 13:45:08.000000000 -0300 @@ -75,10 +75,19 @@ return; local_irq_save(flags); +#ifdef CONFIG_PREEMPT_RT + /* + * A bit hairy. 
Netconsole uses mutexes (indirectly) and + * thus must have interrupts enabled: + */ + local_irq_enable(); +#endif for(left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); + WARN_ON_RT(irqs_disabled()); netpoll_send_udp(&np, msg, frag); + WARN_ON_RT(irqs_disabled()); msg += frag; left -= frag; } Index: 2.6-8xx/drivers/net/ppp_async.c =================================================================== --- 2.6-8xx.orig/drivers/net/ppp_async.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/net/ppp_async.c 2005-06-16 13:45:08.000000000 -0300 @@ -65,7 +65,7 @@ struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: 2.6-8xx/drivers/char/tpm/tpm_nsc.c =================================================================== --- 2.6-8xx.orig/drivers/char/tpm/tpm_nsc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/tpm/tpm_nsc.c 2005-06-16 13:45:08.000000000 -0300 @@ -56,8 +56,7 @@ static int wait_for_stat(struct tpm_chip *chip, u8 mask, u8 val, u8 * data) { int expired = 0; - struct timer_list status_timer = - TIMER_INITIALIZER(tpm_time_expired, jiffies + 10 * HZ, + DEFINE_TIMER(status_timer, tpm_time_expired, jiffies + 10 * HZ, (unsigned long) &expired); /* status immediately available check */ @@ -85,8 +84,7 @@ { int status; int expired = 0; - struct timer_list status_timer = - TIMER_INITIALIZER(tpm_time_expired, jiffies + 100, + DEFINE_TIMER(status_timer, tpm_time_expired, jiffies + 100, (unsigned long) &expired); /* status immediately available check */ Index: 2.6-8xx/drivers/char/watchdog/mixcomwd.c =================================================================== --- 2.6-8xx.orig/drivers/char/watchdog/mixcomwd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/watchdog/mixcomwd.c 2005-06-16 13:45:08.000000000 -0300 @@ -59,7 +59,7 @@ static int watchdog_port; static int 
mixcomwd_timer_alive; -static struct timer_list mixcomwd_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(mixcomwd_timer, NULL, 0, 0); static char expect_close; #ifdef CONFIG_WATCHDOG_NOWAYOUT Index: 2.6-8xx/drivers/char/watchdog/softdog.c =================================================================== --- 2.6-8xx.orig/drivers/char/watchdog/softdog.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/watchdog/softdog.c 2005-06-16 13:45:08.000000000 -0300 @@ -80,8 +80,7 @@ static void watchdog_fire(unsigned long); -static struct timer_list watchdog_ticktock = - TIMER_INITIALIZER(watchdog_fire, 0, 0); +static DEFINE_TIMER(watchdog_ticktock, watchdog_fire, 0, 0); static unsigned long timer_alive; static char expect_close; Index: 2.6-8xx/drivers/char/Kconfig =================================================================== --- 2.6-8xx.orig/drivers/char/Kconfig 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/Kconfig 2005-06-16 13:45:08.000000000 -0300 @@ -711,6 +711,45 @@ To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + tristate "Real Time Clock Histogram Support" + default y + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. 
+ + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source system need to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + Then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 Index: 2.6-8xx/drivers/char/vt.c =================================================================== --- 2.6-8xx.orig/drivers/char/vt.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/vt.c 2005-06-16 13:45:08.000000000 -0300 @@ -2159,6 +2159,13 @@ if (vc->vc_mode != KD_TEXT) goto quit; + /* + * Skip kernel message from within a critical section going + * to a preemptible console (such as fbcon).
+ */ + if (in_atomic_rt() && vc->vc_sw->con_preemptible) + goto quit; + /* undraw cursor first */ if (IS_FG(vc)) hide_cursor(vc); @@ -2782,8 +2789,8 @@ } hide_cursor(vc); - del_timer_sync(&console_timer); - blank_timer_expired = 0; +// del_timer_sync(&console_timer); +// blank_timer_expired = 0; save_screen(vc); /* In case we need to reset origin, blanking hook returns 1 */ @@ -2796,8 +2803,8 @@ return; if (vesa_off_interval) { - blank_state = blank_vesa_wait, - mod_timer(&console_timer, jiffies + vesa_off_interval); + blank_state = blank_vesa_wait; +// mod_timer(&console_timer, jiffies + vesa_off_interval); } if (vesa_blank_mode) @@ -2834,8 +2841,11 @@ return; /* but leave console_blanked != 0 */ if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); - blank_state = blank_normal_wait; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); +#endif +// mod_timer(&console_timer, jiffies + blankinterval); +// blank_state = blank_normal_wait; } console_blanked = 0; @@ -2886,16 +2896,16 @@ /* This isn't perfectly race free, but a race here would be mostly harmless, * at worse, we'll do a spurrious blank and it's unlikely */ - del_timer(&console_timer); - blank_timer_expired = 0; +// del_timer(&console_timer); +// blank_timer_expired = 0; if (ignore_poke || !vc_cons[fg_console].d || vc_cons[fg_console].d->vc_mode == KD_GRAPHICS) return; if (console_blanked) unblank_screen(); else if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); - blank_state = blank_normal_wait; +// mod_timer(&console_timer, jiffies + blankinterval); +// blank_state = blank_normal_wait; } } Index: 2.6-8xx/drivers/char/blocker.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/drivers/char/blocker.c 2005-06-16 14:37:30.000000000 -0300 @@ -0,0 +1,118 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define 
BLOCK_SET_DEPTH 4246 + +#define MAX_LOCK_DEPTH 10 +/* Read the CPU cycle counter (rdtsc, or *oscr when ARCHARM is defined). */ +u64 notrace get_cpu_tick(void) +{ + u64 tsc; +#ifdef ARCHARM + tsc = *oscr; +#else + __asm__ __volatile__("rdtsc" : "=A" (tsc)); +#endif + return tsc; +} +/* Busy-wait by reading the cycle counter 'loops' times. */ +void notrace loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cpu_tick(); +} + +static spinlock_t blocker_lock[MAX_LOCK_DEPTH]; +/* Nesting depth used by BLOCK_IOCTL; BLOCK_SET_DEPTH keeps it >= 1. */ +static unsigned int lock_depth = 1; +/* Take the nested locks, spin for 'args' iterations, then drop them. */ +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} +/* BLOCK_IOCTL runs one lock-and-loop pass; BLOCK_SET_DEPTH sets lock_depth. */ +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args < 1 || args >= MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; +/* Initialise every lock before the device is registered and can be opened. */ +static int __init blocker_init(void) +{ + int i; + + for (i = 0; i < MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + if (misc_register(&blocker_dev)) + return -ENODEV; + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: 2.6-8xx/drivers/char/istallion.c =================================================================== ---
2.6-8xx.orig/drivers/char/istallion.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/istallion.c 2005-06-16 13:45:08.000000000 -0300 @@ -781,7 +781,7 @@ * much cheaper on host cpu than using interrupts. It turns out to * not increase character latency by much either... */ -static struct timer_list stli_timerlist = TIMER_INITIALIZER(stli_poll, 0, 0); +static DEFINE_TIMER(stli_timerlist, stli_poll, 0, 0); static int stli_timeron; Index: 2.6-8xx/drivers/char/specialix.c =================================================================== --- 2.6-8xx.orig/drivers/char/specialix.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/specialix.c 2005-06-16 13:45:08.000000000 -0300 @@ -2491,7 +2491,7 @@ #endif for (i = 0; i < SX_NBOARD; i++) - sx_board[i].lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sx_board[i].lock); if (sx_init_drivers()) { func_exit(); Index: 2.6-8xx/drivers/char/lpptest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6-8xx/drivers/char/lpptest.c 2005-06-16 13:45:08.000000000 -0300 @@ -0,0 +1,162 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + raw_local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + rdtscll(now); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + raw_local_irq_enable(); + + return 0; + } + } + rdtscll(end); + outb(0x00, 0x378); + raw_local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + 
.ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= SA_NODELAY | SA_INTERRUPT; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: 2.6-8xx/drivers/char/tty_io.c =================================================================== --- 2.6-8xx.orig/drivers/char/tty_io.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/tty_io.c 2005-06-16 13:45:08.000000000 -0300 @@ -224,6 +224,7 @@ printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -827,8 +828,8 @@ p->signal->tty = NULL; if (!p->signal->leader) continue; - send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) p->signal->tty_old_pgrp = tty->pgrp; } while_each_task_pid(tty->session, PIDTYPE_SID, p); Index: 2.6-8xx/drivers/char/sysrq.c =================================================================== --- 2.6-8xx.orig/drivers/char/sysrq.c 2005-06-16 
12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/sysrq.c 2005-06-16 13:45:08.000000000 -0300 @@ -153,6 +153,38 @@ .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_RT_DEADLOCK_DETECT + +static void sysrq_handle_showlocks(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + show_all_locks(); +} + +static struct sysrq_key_op sysrq_showlocks_op = { + .handler = sysrq_handle_showlocks, + .help_msg = "show-all-locks(D)", + .action_msg = "Show Locks Held", +}; + +#endif + +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; + +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) @@ -274,7 +306,11 @@ and will never arrive */ /* b */ &sysrq_reboot_op, /* c */ NULL, +#ifdef CONFIG_RT_DEADLOCK_DETECT +/* d */ &sysrq_showlocks_op, +#else /* d */ NULL, +#endif /* e */ &sysrq_term_op, /* f */ &sysrq_moom_op, /* g */ NULL, @@ -286,7 +322,11 @@ #else /* k */ NULL, #endif +#if defined(__i386__) +/* l */ &sysrq_showallregs_op, +#else /* l */ NULL, +#endif /* m */ &sysrq_showmem_op, /* n */ &sysrq_unrt_op, /* o */ NULL, /* This will often be registered Index: 2.6-8xx/drivers/char/rtc.c =================================================================== --- 2.6-8xx.orig/drivers/char/rtc.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/rtc.c 2005-06-16 13:45:08.000000000 -0300 @@ -78,6 +78,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,28 @@ #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the 
application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -204,7 +227,147 @@ return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + rtc_state = S_READ_MISSED; + printk("`%s'[%d] is being piggy. need_resched=%d, cpu=%d\n", + current->comm, current->pid, + need_resched(), smp_processor_id()); + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. 
We're waiting for the usermode + * application to read the rtc + */ + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with SA_INTERRUPT set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -217,6 +380,8 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + int mod; + /* * Can be an alarm interrupt, update complete interrupt, * or a periodic interrupt. 
We store the status in the @@ -238,19 +403,22 @@ rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); } + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; spin_unlock (&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); /* Now do the rest of the actions */ spin_lock(&rtc_task_lock); if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -354,6 +522,8 @@ schedule(); } while (1); + rtc_read_event(); + if (count < sizeof(unsigned long)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -404,8 +574,8 @@ if (rtc_status & RTC_TIMER_ON) { spin_lock_irq (&rtc_lock); rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); spin_unlock_irq (&rtc_lock); + del_timer(&rtc_irq_timer); } return 0; } @@ -423,9 +593,9 @@ if (!(rtc_status & RTC_TIMER_ON)) { spin_lock_irq (&rtc_lock); rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100; - add_timer(&rtc_irq_timer); rtc_status |= RTC_TIMER_ON; spin_unlock_irq (&rtc_lock); + add_timer(&rtc_irq_timer); } set_rtc_irq_bit(RTC_PIE); return 0; @@ -583,6 +753,11 @@ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! 
=B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -592,6 +767,7 @@ CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -689,6 +865,7 @@ if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -710,6 +887,7 @@ { #ifdef RTC_IRQ unsigned char tmp; + int del; if (rtc_has_irq == 0) goto no_irq; @@ -728,11 +906,14 @@ CMOS_WRITE(tmp, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); } + del = 0; if (rtc_status & RTC_TIMER_ON) { rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); + del = 1; } spin_unlock_irq(&rtc_lock); + if (del) + del_timer(&rtc_irq_timer); if (file->f_flags & FASYNC) { rtc_fasync (-1, file, 0); @@ -744,6 +925,7 @@ rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -808,6 +990,7 @@ return -EIO; #else unsigned char tmp; + int del; spin_lock_irq(&rtc_lock); spin_lock(&rtc_task_lock); @@ -827,12 +1010,15 @@ CMOS_WRITE(tmp, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); } + del = 0; if (rtc_status & RTC_TIMER_ON) { rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); + del = 1; } rtc_status &= ~RTC_IS_OPEN; spin_unlock(&rtc_task_lock); + if (del) + del_timer(&rtc_irq_timer); spin_unlock_irq(&rtc_lock); return 0; #endif @@ -894,7 +1080,6 @@ struct proc_dir_entry *ent; #if defined(__alpha__) || defined(__mips__) unsigned int year, ctrl; - unsigned long uip_watchdog; char *guess = NULL; #endif #ifdef __sparc__ @@ -1000,12 +1185,8 @@ /* Each operating system on an Alpha uses its own epoch. Let's try to guess which one we are using now. 
*/ - uip_watchdog = jiffies; if (rtc_is_updating() != 0) - while (jiffies - uip_watchdog < 2*HZ/100) { - barrier(); - cpu_relax(); - } + msleep(2*HZ/100); spin_lock_irq(&rtc_lock); year = CMOS_READ(RTC_YEAR); @@ -1097,6 +1278,7 @@ static void rtc_dropped_irq(unsigned long data) { unsigned long freq; + int mod; spin_lock_irq (&rtc_lock); @@ -1106,8 +1288,9 @@ } /* Just in case someone disabled the timer from behind our back... */ + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; rtc_irq_data += ((rtc_freq/HZ)<<8); rtc_irq_data &= ~0xff; @@ -1116,6 +1299,8 @@ freq = rtc_freq; spin_unlock_irq(&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); @@ -1213,7 +1398,6 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm) { - unsigned long uip_watchdog = jiffies; unsigned char ctrl; #ifdef CONFIG_MACH_DECSTATION unsigned int real_year; @@ -1221,19 +1405,15 @@ /* * read RTC once any update in progress is done. The update - * can take just over 2ms. We wait 10 to 20ms. There is no need to + * can take just over 2ms. We wait 20ms. There is no need to * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP. * If you need to know *exactly* when a second has started, enable * periodic update complete interrupts, (via ioctl) and then * immediately read /dev/rtc which will block until you get the IRQ. * Once the read clears, read the RTC time (again via ioctl). Easy. */ - if (rtc_is_updating() != 0) - while (jiffies - uip_watchdog < 2*HZ/100) { - barrier(); - cpu_relax(); - } + msleep(2*HZ/100); /* * Only the values that we read from the RTC are set. 
We leave Index: 2.6-8xx/drivers/char/hangcheck-timer.c =================================================================== --- 2.6-8xx.orig/drivers/char/hangcheck-timer.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/hangcheck-timer.c 2005-06-16 13:45:08.000000000 -0300 @@ -149,8 +149,7 @@ static void hangcheck_fire(unsigned long); -static struct timer_list hangcheck_ticktock = - TIMER_INITIALIZER(hangcheck_fire, 0, 0); +static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire, 0, 0); static void hangcheck_fire(unsigned long data) Index: 2.6-8xx/drivers/char/cyclades.c =================================================================== --- 2.6-8xx.orig/drivers/char/cyclades.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/cyclades.c 2005-06-16 13:45:08.000000000 -0300 @@ -865,7 +865,7 @@ static long cyz_polling_cycle = CZ_DEF_POLL; static int cyz_timeron = 0; -static struct timer_list cyz_timerlist = TIMER_INITIALIZER(cyz_poll, 0, 0); +static DEFINE_TIMER(cyz_timerlist, cyz_poll, 0, 0); #else /* CONFIG_CYZ_INTR */ static void cyz_rx_restart(unsigned long); Index: 2.6-8xx/drivers/char/sx.c =================================================================== --- 2.6-8xx.orig/drivers/char/sx.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/sx.c 2005-06-16 13:45:08.000000000 -0300 @@ -2321,7 +2321,7 @@ #ifdef NEW_WRITE_LOCKING port->gs.port_write_sem = MUTEX; #endif - port->gs.driver_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&port->gs.driver_lock); /* * Initializing wait queue */ Index: 2.6-8xx/drivers/char/Makefile =================================================================== --- 2.6-8xx.orig/drivers/char/Makefile 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/Makefile 2005-06-16 13:45:08.000000000 -0300 @@ -57,6 +57,8 @@ obj-$(CONFIG_APPLICOM) += applicom.o obj-$(CONFIG_SONYPI) += sonypi.o obj-$(CONFIG_RTC) += rtc.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o obj-$(CONFIG_HPET) 
+= hpet.o obj-$(CONFIG_GEN_RTC) += genrtc.o obj-$(CONFIG_EFI_RTC) += efirtc.o Index: 2.6-8xx/drivers/char/keyboard.c =================================================================== --- 2.6-8xx.orig/drivers/char/keyboard.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/keyboard.c 2005-06-16 13:45:08.000000000 -0300 @@ -233,8 +233,7 @@ } } -static struct timer_list kd_mksound_timer = - TIMER_INITIALIZER(kd_nosound, 0, 0); +static DEFINE_TIMER(kd_mksound_timer, kd_nosound, 0, 0); void kd_mksound(unsigned int hz, unsigned int ticks) { Index: 2.6-8xx/drivers/char/ip2main.c =================================================================== --- 2.6-8xx.orig/drivers/char/ip2main.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/ip2main.c 2005-06-16 13:45:08.000000000 -0300 @@ -255,7 +255,7 @@ * selected, the board is serviced periodically to see if anything needs doing. */ #define POLL_TIMEOUT (jiffies + 1) -static struct timer_list PollTimer = TIMER_INITIALIZER(ip2_poll, 0, 0); +static DEFINE_TIMER(PollTimer, ip2_poll, 0, 0); static char TimerOn; #ifdef IP2DEBUG_TRACE Index: 2.6-8xx/drivers/char/random.c =================================================================== --- 2.6-8xx.orig/drivers/char/random.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/char/random.c 2005-06-16 13:45:08.000000000 -0300 @@ -417,7 +417,6 @@ .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = SPIN_LOCK_UNLOCKED, .pool = input_pool_data }; @@ -426,7 +425,6 @@ .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, .pool = blocking_pool_data }; @@ -434,7 +432,6 @@ .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, .pool = nonblocking_pool_data }; @@ -581,8 +578,11 @@ preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; 
+ (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -627,9 +627,6 @@ if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } extern void add_input_randomness(unsigned int type, unsigned int code, @@ -894,9 +891,15 @@ static int __init rand_initialize(void) { + spin_lock_init(&input_pool.lock); init_std_data(&input_pool); + + spin_lock_init(&blocking_pool.lock); init_std_data(&blocking_pool); + + spin_lock_init(&nonblocking_pool.lock); init_std_data(&nonblocking_pool); + return 0; } module_init(rand_initialize); Index: 2.6-8xx/drivers/atm/iphase.c =================================================================== --- 2.6-8xx.orig/drivers/atm/iphase.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/atm/iphase.c 2005-06-16 13:45:08.000000000 -0300 @@ -79,7 +79,7 @@ static struct atm_dev *_ia_dev[8]; static int iadev_count; static void ia_led_timer(unsigned long arg); -static struct timer_list ia_timer = TIMER_INITIALIZER(ia_led_timer, 0, 0); +static DEFINE_TIMER(ia_timer, ia_led_timer, 0, 0); static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ; static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ; static uint IADebugFlag = /* IF_IADBG_ERR | IF_IADBG_CBR| IF_IADBG_INIT_ADAPTER Index: 2.6-8xx/drivers/atm/idt77105.c =================================================================== --- 2.6-8xx.orig/drivers/atm/idt77105.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/atm/idt77105.c 2005-06-16 13:45:08.000000000 -0300 @@ -50,10 +50,8 @@ static void idt77105_restart_timer_func(unsigned long); -static struct timer_list stats_timer = - TIMER_INITIALIZER(idt77105_stats_timer_func, 0, 0); -static struct timer_list restart_timer = - TIMER_INITIALIZER(idt77105_restart_timer_func, 0, 0); +static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func, 0, 
0); +static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func, 0, 0); static int start_timer = 1; static struct idt77105_priv *idt77105_all = NULL; Index: 2.6-8xx/drivers/atm/atmtcp.c =================================================================== --- 2.6-8xx.orig/drivers/atm/atmtcp.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/atm/atmtcp.c 2005-06-16 13:45:08.000000000 -0300 @@ -368,7 +368,7 @@ .ops = &atmtcp_c_dev_ops, .type = "atmtcp", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(atmtcp_control_dev.lock) }; Index: 2.6-8xx/drivers/input/joystick/analog.c =================================================================== --- 2.6-8xx.orig/drivers/input/joystick/analog.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/input/joystick/analog.c 2005-06-16 13:45:08.000000000 -0300 @@ -140,12 +140,14 @@ */ #ifdef __i386__ + +#include + #define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? 
CLOCK_TICK_RATE / HZ : 0))) #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") static unsigned int get_time_pit(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; Index: 2.6-8xx/drivers/input/gameport/gameport.c =================================================================== --- 2.6-8xx.orig/drivers/input/gameport/gameport.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/input/gameport/gameport.c 2005-06-16 13:45:08.000000000 -0300 @@ -61,12 +61,13 @@ #if defined(__i386__) +#include + #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0)) #define GET_TIME(x) do { x = get_time_pit(); } while (0) static unsigned int get_time_pit(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; Index: 2.6-8xx/drivers/oprofile/oprofilefs.c =================================================================== --- 2.6-8xx.orig/drivers/oprofile/oprofilefs.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/oprofile/oprofilefs.c 2005-06-16 13:45:08.000000000 -0300 @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: 2.6-8xx/drivers/cpufreq/cpufreq.c =================================================================== --- 2.6-8xx.orig/drivers/cpufreq/cpufreq.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cpufreq/cpufreq.c 2005-06-16 13:45:08.000000000 -0300 @@ -605,7 +605,8 @@ policy->cpu = cpu; policy->cpus = cpumask_of_cpu(cpu); - init_MUTEX_LOCKED(&policy->lock); + init_MUTEX(&policy->lock); + down(&policy->lock); init_completion(&policy->kobj_unregister); INIT_WORK(&policy->update, handle_update, (void *)(long)cpu); Index: 2.6-8xx/drivers/parisc/iosapic.c =================================================================== --- 2.6-8xx.orig/drivers/parisc/iosapic.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/parisc/iosapic.c 2005-06-16 
13:45:08.000000000 -0300 @@ -215,7 +215,7 @@ #define IOSAPIC_IRDT_ID_EID_SHIFT 0x10 -static spinlock_t iosapic_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(iosapic_lock); static inline void iosapic_eoi(void __iomem *addr, unsigned int data) { Index: 2.6-8xx/drivers/scsi/aic7xxx/aic79xx_osm.h =================================================================== --- 2.6-8xx.orig/drivers/scsi/aic7xxx/aic79xx_osm.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/scsi/aic7xxx/aic79xx_osm.h 2005-06-16 13:45:08.000000000 -0300 @@ -529,9 +529,9 @@ struct timer_list completeq_timer; struct timer_list reset_timer; struct timer_list stats_timer; - struct semaphore eh_sem; - struct semaphore dv_sem; - struct semaphore dv_cmd_sem; /* XXX This needs to be in + struct compat_semaphore eh_sem; + struct compat_semaphore dv_sem; + struct compat_semaphore dv_cmd_sem; /* XXX This needs to be in * the target struct */ struct scsi_device *dv_scsi_dev; Index: 2.6-8xx/drivers/scsi/aic7xxx/aic7xxx_osm.h =================================================================== --- 2.6-8xx.orig/drivers/scsi/aic7xxx/aic7xxx_osm.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/scsi/aic7xxx/aic7xxx_osm.h 2005-06-16 13:45:08.000000000 -0300 @@ -450,7 +450,7 @@ spinlock_t spin_lock; u_int qfrozen; struct timer_list reset_timer; - struct semaphore eh_sem; + struct compat_semaphore eh_sem; struct Scsi_Host *host; /* pointer to scsi host */ #define AHC_LINUX_NOIRQ ((uint32_t)~0) uint32_t irq; /* IRQ for this adapter */ Index: 2.6-8xx/drivers/scsi/aacraid/aacraid.h =================================================================== --- 2.6-8xx.orig/drivers/scsi/aacraid/aacraid.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/scsi/aacraid/aacraid.h 2005-06-16 13:45:08.000000000 -0300 @@ -666,7 +666,7 @@ u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link context's into a linked 
list - struct semaphore wait_sem; // this is used to wait for the next fib to arrive. + struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attachd hw_fibs @@ -733,7 +733,7 @@ * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. */ - struct semaphore event_wait; + struct compat_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ Index: 2.6-8xx/drivers/scsi/pluto.c =================================================================== --- 2.6-8xx.orig/drivers/scsi/pluto.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/scsi/pluto.c 2005-06-16 13:45:08.000000000 -0300 @@ -95,8 +95,7 @@ int i, retry, nplutos; fc_channel *fc; Scsi_Device dev; - struct timer_list fc_timer = - TIMER_INITIALIZER(pluto_detect_timeout, 0, 0); + struct DEFINE_TIMER(fc_timer, pluto_detect_timeout, 0, 0); tpnt->proc_name = "pluto"; fcscount = 0; Index: 2.6-8xx/drivers/ieee1394/raw1394-private.h =================================================================== --- 2.6-8xx.orig/drivers/ieee1394/raw1394-private.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/ieee1394/raw1394-private.h 2005-06-16 13:45:08.000000000 -0300 @@ -29,7 +29,7 @@ struct list_head req_pending; struct list_head req_complete; - struct semaphore complete_sem; + struct compat_semaphore complete_sem; spinlock_t reqlists_lock; wait_queue_head_t poll_wait_complete; Index: 2.6-8xx/drivers/ieee1394/ieee1394_types.h =================================================================== --- 2.6-8xx.orig/drivers/ieee1394/ieee1394_types.h 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/ieee1394/ieee1394_types.h 2005-06-16 13:45:08.000000000 -0300 @@ -19,7 +19,7 @@ spinlock_t lock; u8 next; u32 allocations; - 
struct semaphore count; + struct compat_semaphore count; }; #define HPSB_TPOOL_INIT(_tp) \ @@ -28,7 +28,7 @@ spin_lock_init(&(_tp)->lock); \ (_tp)->next = 0; \ (_tp)->allocations = 0; \ - sema_init(&(_tp)->count, 63); \ + sema_init(&(_tp)->count, 63); \ } while (0) Index: 2.6-8xx/drivers/ieee1394/nodemgr.c =================================================================== --- 2.6-8xx.orig/drivers/ieee1394/nodemgr.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/ieee1394/nodemgr.c 2005-06-16 13:45:08.000000000 -0300 @@ -114,7 +114,7 @@ struct hpsb_host *host; struct list_head list; struct completion exited; - struct semaphore reset_sem; + struct compat_semaphore reset_sem; int pid; char daemon_name[15]; int kill_me; Index: 2.6-8xx/drivers/sbus/char/cpwatchdog.c =================================================================== --- 2.6-8xx.orig/drivers/sbus/char/cpwatchdog.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/sbus/char/cpwatchdog.c 2005-06-16 13:45:08.000000000 -0300 @@ -155,7 +155,7 @@ }; static struct wd_device wd_dev = { - 0, SPIN_LOCK_UNLOCKED, 0, 0, 0, 0, + 0, SPIN_LOCK_UNLOCKED(wd_dev.lock), 0, 0, 0, 0, }; static struct timer_list wd_timer; Index: 2.6-8xx/drivers/sbus/char/aurora.c =================================================================== --- 2.6-8xx.orig/drivers/sbus/char/aurora.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/sbus/char/aurora.c 2005-06-16 13:45:08.000000000 -0300 @@ -871,8 +871,7 @@ #ifdef AURORA_INT_DEBUG static void aurora_timer (unsigned long ignored); -static struct timer_list aurora_poll_timer = - TIMER_INITIALIZER(aurora_timer, 0, 0); +static DEFINE_TIMER(aurora_poll_timer, aurora_timer, 0, 0); static void aurora_timer (unsigned long ignored) Index: 2.6-8xx/drivers/cdrom/aztcd.c =================================================================== --- 2.6-8xx.orig/drivers/cdrom/aztcd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cdrom/aztcd.c 2005-06-16 13:45:08.000000000 
-0300 @@ -297,7 +297,7 @@ static int AztTimeout, AztTries; static DECLARE_WAIT_QUEUE_HEAD(azt_waitq); -static struct timer_list delay_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(delay_timer, NULL, 0, 0); static struct azt_DiskInfo DiskInfo; static struct azt_Toc Toc[MAX_TRACKS]; Index: 2.6-8xx/drivers/cdrom/sjcd.c =================================================================== --- 2.6-8xx.orig/drivers/cdrom/sjcd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cdrom/sjcd.c 2005-06-16 13:45:08.000000000 -0300 @@ -151,7 +151,7 @@ /* * Timer. */ -static struct timer_list sjcd_delay_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(sjcd_delay_timer, NULL, 0, 0); #define SJCD_SET_TIMER( func, tmout ) \ ( sjcd_delay_timer.expires = jiffies+tmout, \ Index: 2.6-8xx/drivers/cdrom/sbpcd.c =================================================================== --- 2.6-8xx.orig/drivers/cdrom/sbpcd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cdrom/sbpcd.c 2005-06-16 13:45:08.000000000 -0300 @@ -742,13 +742,10 @@ unsigned long cli_sti; /* for saving the processor flags */ #endif /*==========================================================================*/ -static struct timer_list delay_timer = - TIMER_INITIALIZER(mark_timeout_delay, 0, 0); -static struct timer_list data_timer = - TIMER_INITIALIZER(mark_timeout_data, 0, 0); +static DEFINE_TIMER(delay_timer, mark_timeout_delay, 0, 0); +static DEFINE_TIMER(data_timer, mark_timeout_data, 0, 0); #if 0 -static struct timer_list audio_timer = - TIMER_INITIALIZER(mark_timeout_audio, 0, 0); +static DEFINE_TIMER(audio_timer, mark_timeout_audio, 0, 0); #endif /*==========================================================================*/ /* Index: 2.6-8xx/drivers/cdrom/optcd.c =================================================================== --- 2.6-8xx.orig/drivers/cdrom/optcd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cdrom/optcd.c 2005-06-16 13:45:08.000000000 -0300 @@ 
-264,7 +264,7 @@ static int sleep_timeout; /* max # of ticks to sleep */ static DECLARE_WAIT_QUEUE_HEAD(waitq); static void sleep_timer(unsigned long data); -static struct timer_list delay_timer = TIMER_INITIALIZER(sleep_timer, 0, 0); +static DEFINE_TIMER(delay_timer, sleep_timer, 0, 0); static DEFINE_SPINLOCK(optcd_lock); static struct request_queue *opt_queue; Index: 2.6-8xx/drivers/cdrom/gscd.c =================================================================== --- 2.6-8xx.orig/drivers/cdrom/gscd.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/cdrom/gscd.c 2005-06-16 13:45:08.000000000 -0300 @@ -146,7 +146,7 @@ static int AudioEnd_m; static int AudioEnd_f; -static struct timer_list gscd_timer = TIMER_INITIALIZER(NULL, 0, 0); +static DEFINE_TIMER(gscd_timer, NULL, 0, 0); static DEFINE_SPINLOCK(gscd_lock); static struct request_queue *gscd_queue; Index: 2.6-8xx/drivers/video/backlight/corgi_bl.c =================================================================== --- 2.6-8xx.orig/drivers/video/backlight/corgi_bl.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/video/backlight/corgi_bl.c 2005-06-16 13:45:08.000000000 -0300 @@ -29,7 +29,7 @@ static int corgibl_powermode = FB_BLANK_UNBLANK; static int current_intensity = 0; static int corgibl_limit = 0; -static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(bl_lock); static void corgibl_send_intensity(int intensity) { Index: 2.6-8xx/drivers/video/console/vgacon.c =================================================================== --- 2.6-8xx.orig/drivers/video/console/vgacon.c 2005-06-16 12:51:35.000000000 -0300 +++ 2.6-8xx/drivers/video/console/vgacon.c 2005-06-16 13:45:08.000000000 -0300 @@ -53,7 +53,7 @@ #include