X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=arch%2Fpowerpc%2Fplatforms%2Fpseries%2Feeh_event.c;h=8f2d12935b99fe47de2a273ad4b12338fbcb5f4f;hb=acf7d76827a577059636e949079021e6af6dd702;hp=92497333c2b65af53a4f007e2b1a09c5954f0f4a;hpb=97f2aab6698f3ab2552c41c1024a65ffd0763a6d;p=powerpc.git diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index 92497333c2..8f2d12935b 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -18,9 +18,13 @@ * Copyright (c) 2005 Linas Vepstas */ +#include #include +#include #include +#include #include +#include /** Overview: * EEH error states may be detected within exception handlers; @@ -36,66 +40,60 @@ LIST_HEAD(eeh_eventlist); static void eeh_thread_launcher(void *); DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); -/** - * eeh_panic - call panic() for an eeh event that cannot be handled. - * The philosophy of this routine is that it is better to panic and - * halt the OS than it is to risk possible data corruption by - * oblivious device drivers that don't know better. - * - * @dev pci device that had an eeh event - * @reset_state current reset state of the device slot - */ -static void eeh_panic(struct pci_dev *dev, int reset_state) -{ - /* - * Since the panic_on_oops sysctl is used to halt the system - * in light of potential corruption, we can use it here. - */ - if (panic_on_oops) { - panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, - pci_name(dev)); - } - else { - printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", - reset_state, pci_name(dev)); - } -} +/* Serialize reset sequences for a given pci device */ +DEFINE_MUTEX(eeh_event_mutex); /** - * eeh_event_handler - dispatch EEH events. The detection of a frozen - * slot can occur inside an interrupt, where it can be hard to do - * anything about it. The goal of this routine is to pull these - * detection events out of the context of the interrupt handler, and - * re-dispatch them for processing at a later time in a normal context. - * + * eeh_event_handler - dispatch EEH events. * @dummy - unused + * + * The detection of a frozen slot can occur inside an interrupt, + * where it can be hard to do anything about it. The goal of this + * routine is to pull these detection events out of the context + * of the interrupt handler, and re-dispatch them for processing + * at a later time in a normal context. */ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; + struct pci_dn *pdn; daemonize ("eehd"); + set_current_state(TASK_INTERRUPTIBLE); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + + /* Unqueue the event, get ready to process. */ + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + if (event == NULL) + return 0; - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - if (event == NULL) - break; + /* Serialize processing of EEH events */ + mutex_lock(&eeh_event_mutex); + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - pci_name(event->dev)); + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + pci_name(event->dev)); - eeh_panic (event->dev, event->state); + pdn = handle_eeh_events(event); - kfree(event); + eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); + pci_dev_put(event->dev); + kfree(event); + mutex_unlock(&eeh_event_mutex); + + /* If there are no new errors after an hour, clear the counter. */ + if (pdn && pdn->eeh_freeze_count>0) { + msleep_interruptible (3600*1000); + if (pdn->eeh_freeze_count>0) + pdn->eeh_freeze_count--; } return 0; @@ -103,7 +101,6 @@ static int eeh_event_handler(void * dummy) /** * eeh_thread_launcher - * * @dummy - unused */ static void eeh_thread_launcher(void *dummy) @@ -122,12 +119,20 @@ static void eeh_thread_launcher(void *dummy) */ int eeh_send_failure_event (struct device_node *dn, struct pci_dev *dev, - int state, + enum pci_channel_state state, int time_unavail) { unsigned long flags; struct eeh_event *event; + char *location; + if (!mem_init_done) { + printk(KERN_ERR "EEH: event during early boot not handled\n"); + location = (char *) get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); + printk(KERN_ERR "EEH: PCI location = %s\n", location); + return 1; + } event = kmalloc(sizeof(*event), GFP_ATOMIC); if (event == NULL) { printk (KERN_ERR "EEH: out of memory, event not handled\n");