drivers/scsi/scsi_error.c

   1 /*
   2  *  scsi_error.c Copyright (C) 1997 Eric Youngdale
   3  *
   4  *  SCSI error/timeout handling
   5  *      Initial versions: Eric Youngdale.  Based upon conversations with
   6  *                        Leonard Zubkoff and David Miller at Linux Expo,
   7  *                        ideas originating from all over the place.
   8  *
   9  */
  10
  11 #define __NO_VERSION__
  12 #include <linux/module.h>
  13
  14 #include <linux/sched.h>
  15 #include <linux/timer.h>
  16 #include <linux/string.h>
  17 #include <linux/slab.h>
  18 #include <linux/ioport.h>
  19 #include <linux/kernel.h>
  20 #include <linux/stat.h>
  21 #include <linux/blk.h>
  22 #include <linux/interrupt.h>
  23 #include <linux/delay.h>
  24 #include <linux/smp_lock.h>
  25
  26 #define __KERNEL_SYSCALLS__
  27
  28 #include <linux/unistd.h>
  29
  30 #include <asm/system.h>
  31 #include <asm/irq.h>
  32 #include <asm/dma.h>
  33
  34 #include "scsi.h"
  35 #include "hosts.h"
  36 #include "constants.h"
  37
  38 /*
  39  * We must always allow SHUTDOWN_SIGS.  Even if we are not a module,
  40  * the host drivers that we are using may be loaded as modules, and
  41  * when we unload these,  we need to ensure that the error handler thread
  42  * can be shut down.
  43  *
  44  * Note - when we unload a module, we send a SIGHUP.  We mustn't
  45  * enable SIGTERM, as this is how the init shuts things down when you
  46  * go to single-user mode.  For that matter, init also sends SIGKILL,
  47  * so we mustn't enable that one either.  We use SIGHUP instead.  Other
  48  * options would be SIGPWR, I suppose.
  49  */
  50 #define SHUTDOWN_SIGS   (sigmask(SIGHUP))
  51
  52 #ifdef DEBUG
  53 #define SENSE_TIMEOUT SCSI_TIMEOUT
  54 #define ABORT_TIMEOUT SCSI_TIMEOUT
  55 #define RESET_TIMEOUT SCSI_TIMEOUT
  56 #else
  57 #define SENSE_TIMEOUT (10*HZ)
  58 #define RESET_TIMEOUT (2*HZ)
  59 #define ABORT_TIMEOUT (15*HZ)
  60 #endif
  61
  62 #define STATIC
  63
  64 /*
  65  * These should *probably* be handled by the host itself.
  66  * Since it is allowed to sleep, it probably should.
  67  */
  68 #define BUS_RESET_SETTLE_TIME   5*HZ
  69 #define HOST_RESET_SETTLE_TIME  10*HZ
  70
  71
  72 static const char RCSid[] = "$Header: /cvshome/samwise/ppclinux/drivers/scsi/scsi_error.c,v 1.1.1.1 2005/04/11 02:50:36 jack Exp $";
  73
  74 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
  75 STATIC int scsi_request_sense(Scsi_Cmnd *);
  76 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
  77 STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
  78 STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
  79 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
  80 STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
  81 STATIC int scsi_try_host_reset(Scsi_Cmnd *);
  82 STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
  83 STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
  84 STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
  85 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
  86 STATIC void scsi_restart_operations(struct Scsi_Host *);
  87 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
  88
  89
  90 /*
  91  * Function:    scsi_add_timer()
  92  *
  93  * Purpose:     Start timeout timer for a single scsi command.
  94  *
  95  * Arguments:   SCset   - command that is about to start running.
  96  *              timeout - amount of time to allow this command to run.
  97  *              complete - timeout function to call if timer isn't
  98  *                      canceled.
  99  *
 100  * Returns:     Nothing
 101  *
 102  * Notes:       This should be turned into an inline function.
 103  *
 104  * More Notes:  Each scsi command has it's own timer, and as it is added to
 105  *              the queue, we set up the timer.  When the command completes,
 106  *              we cancel the timer.  Pretty simple, really, especially
 107  *              compared to the old way of handling this crap.
 108  */
 109 void scsi_add_timer(Scsi_Cmnd * SCset,
 110                     int timeout,
 111                     void (*complete) (Scsi_Cmnd *))
 112 {
 113
 114         /*
 115          * If the clock was already running for this command, then
 116          * first delete the timer.  The timer handling code gets rather
 117          * confused if we don't do this.
 118          */
 119         if (SCset->eh_timeout.function != NULL) {
 120                 del_timer(&SCset->eh_timeout);
 121         }
 122         SCset->eh_timeout.data = (unsigned long) SCset;
 123         SCset->eh_timeout.expires = jiffies + timeout;
 124         SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
 125
 126         SCset->done_late = 0;
 127
 128         SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
 129
 130         add_timer(&SCset->eh_timeout);
 131
 132 }
 133
 134 /*
 135  * Function:    scsi_delete_timer()
 136  *
 137  * Purpose:     Delete/cancel timer for a given function.
 138  *
 139  * Arguments:   SCset   - command that we are canceling timer for.
 140  *
 141  * Returns:     1 if we were able to detach the timer.  0 if we
 142  *              blew it, and the timer function has already started
 143  *              to run.
 144  *
 145  * Notes:       This should be turned into an inline function.
 146  */
 147 int scsi_delete_timer(Scsi_Cmnd * SCset)
 148 {
 149         int rtn;
 150
 151         rtn = del_timer(&SCset->eh_timeout);
 152
 153         SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p %d\n", SCset, rtn));
 154
 155         SCset->eh_timeout.data = (unsigned long) NULL;
 156         SCset->eh_timeout.function = NULL;
 157
 158         return rtn;
 159 }
 160
 161 /*
 162  * Function:    scsi_times_out()
 163  *
 164  * Purpose:     Timeout function for normal scsi commands..
 165  *
 166  * Arguments:   SCpnt   - command that is timing out.
 167  *
 168  * Returns:     Nothing.
 169  *
 170  * Notes:       We do not need to lock this.  There is the potential for
 171  *              a race only in that the normal completion handling might
 172  *              run, but if the normal completion function determines
 173  *              that the timer has already fired, then it mustn't do
 174  *              anything.
 175  */
 176 void scsi_times_out(Scsi_Cmnd * SCpnt)
 177 {
 178         /*
 179          * Notify the low-level code that this operation failed and we are
 180          * reposessing the command.
 181          */
 182 #ifdef ERIC_neverdef
 183         /*
 184          * FIXME(eric)
 185          * Allow the host adapter to push a queue ordering tag
 186          * out to the bus to force the command in question to complete.
 187          * If the host wants to do this, then we just restart the timer
 188          * for the command.  Before we really do this, some real thought
 189          * as to the optimum way to handle this should be done.  We *do*
 190          * need to force ordering every so often to ensure that all requests
 191          * do eventually complete, but I am not sure if this is the best way
 192          * to actually go about it.
 193          *
 194          * Better yet, force a sync here, but don't block since we are in an
 195          * interrupt.
 196          */
 197         if (SCpnt->host->hostt->eh_ordered_queue_tag) {
 198                 if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
 199                         scsi_add_timer(SCpnt, SCpnt->internal_timeout,
 200                                        scsi_times_out);
 201                         return;
 202                 }
 203         }
 204         /*
 205          * FIXME(eric) - add a second special interface to handle this
 206          * case.  Ideally that interface can also be used to request
 207          * a queu
 208          */
 209         if (SCpnt->host->can_queue) {
 210                 SCpnt->host->hostt->queuecommand(SCpnt, NULL);
 211         }
 212 #endif
 213
 214         /* Set the serial_number_at_timeout to the current serial_number */
 215         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 216
 217         SCpnt->eh_state = FAILED;
 218         SCpnt->state = SCSI_STATE_TIMEOUT;
 219         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 220
 221         SCpnt->host->in_recovery = 1;
 222         SCpnt->host->host_failed++;
 223
 224         SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
 225                                    atomic_read(&SCpnt->host->host_active),
 226                                    SCpnt->host->host_busy,
 227                                    SCpnt->host->host_failed));
 228
 229         /*
 230          * If the host is having troubles, then look to see if this was the last
 231          * command that might have failed.  If so, wake up the error handler.
 232          */
 233         if( SCpnt->host->eh_wait == NULL ) {
 234                 panic("Error handler thread not present at %p %p %s %d",
 235                       SCpnt, SCpnt->host, __FILE__, __LINE__);
 236         }
 237         if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
 238                 up(SCpnt->host->eh_wait);
 239         }
 240 }
 241
/*
 * Function     scsi_block_when_processing_errors
 *
 * Purpose:     Prevent more commands from being queued while error recovery
 *              is taking place.
 *
 * Arguments:   SDpnt - device on which we are performing recovery.
 *
 * Returns:     The device's online flag (SDpnt->online) as read after the
 *              wait:
 *              FALSE   The device was taken offline by error recovery.
 *              TRUE    OK to proceed.
 *
 * Notes:       We block until the host is out of error recovery, and then
 *              report whether the device is online.  Only the device's own
 *              online flag is consulted here.
 */
int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
{

        /*
         * Sleep on the host's host_wait queue, keyed on its in_recovery
         * flag.  NOTE(review): SCSI_SLEEP is a macro defined elsewhere;
         * presumably it re-tests the condition expression after each
         * wakeup - confirm against its definition.
         */
        SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);

        SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));

        return SDpnt->online;
}
 265
 266 /*
 267  * Function:    scsi_eh_times_out()
 268  *
 269  * Purpose:     Timeout function for error handling.
 270  *
 271  * Arguments:   SCpnt   - command that is timing out.
 272  *
 273  * Returns:     Nothing.
 274  *
 275  * Notes:       During error handling, the kernel thread will be sleeping
 276  *              waiting for some action to complete on the device.  Our only
 277  *              job is to record that it timed out, and to wake up the
 278  *              thread.
 279  */
 280 STATIC
 281 void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
 282 {
 283         SCpnt->eh_state = SCSI_STATE_TIMEOUT;
 284         SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
 285
 286         if (SCpnt->host->eh_action != NULL)
 287                 up(SCpnt->host->eh_action);
 288         else
 289                 printk("Missing scsi error handler thread\n");
 290 }
 291
 292
 293 /*
 294  * Function:    scsi_eh_done()
 295  *
 296  * Purpose:     Completion function for error handling.
 297  *
 298  * Arguments:   SCpnt   - command that is timing out.
 299  *
 300  * Returns:     Nothing.
 301  *
 302  * Notes:       During error handling, the kernel thread will be sleeping
 303  *              waiting for some action to complete on the device.  Our only
 304  *              job is to record that the action completed, and to wake up the
 305  *              thread.
 306  */
 307 STATIC
 308 void scsi_eh_done(Scsi_Cmnd * SCpnt)
 309 {
 310         int     rtn;
 311
 312         /*
 313          * If the timeout handler is already running, then just set the
 314          * flag which says we finished late, and return.  We have no
 315          * way of stopping the timeout handler from running, so we must
 316          * always defer to it.
 317          */
 318         rtn = del_timer(&SCpnt->eh_timeout);
 319         if (!rtn) {
 320                 SCpnt->done_late = 1;
 321                 return;
 322         }
 323
 324         SCpnt->request.rq_status = RQ_SCSI_DONE;
 325
 326         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 327         SCpnt->eh_state = SUCCESS;
 328
 329         SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
 330                                           SCpnt->result));
 331
 332         if (SCpnt->host->eh_action != NULL)
 333                 up(SCpnt->host->eh_action);
 334 }
 335
 336 /*
 337  * Function:    scsi_eh_action_done()
 338  *
 339  * Purpose:     Completion function for error handling.
 340  *
 341  * Arguments:   SCpnt   - command that is timing out.
 342  *              answer  - boolean that indicates whether operation succeeded.
 343  *
 344  * Returns:     Nothing.
 345  *
 346  * Notes:       This callback is only used for abort and reset operations.
 347  */
 348 STATIC
 349 void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
 350 {
 351         SCpnt->request.rq_status = RQ_SCSI_DONE;
 352
 353         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 354         SCpnt->eh_state = (answer ? SUCCESS : FAILED);
 355
 356         if (SCpnt->host->eh_action != NULL)
 357                 up(SCpnt->host->eh_action);
 358 }
 359
 360 /*
 361  * Function:  scsi_sense_valid()
 362  *
 363  * Purpose:     Determine whether a host has automatically obtained sense
 364  *              information or not.  If we have it, then give a recommendation
 365  *              as to what we should do next.
 366  */
 367 int scsi_sense_valid(Scsi_Cmnd * SCpnt)
 368 {
 369         if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
 370                 return FALSE;
 371         }
 372         return TRUE;
 373 }
 374
 375 /*
 376  * Function:  scsi_eh_retry_command()
 377  *
 378  * Purpose:     Retry the original command
 379  *
 380  * Returns:     SUCCESS - we were able to get the sense data.
 381  *              FAILED  - we were not able to get the sense data.
 382  *
 383  * Notes:       This function will *NOT* return until the command either
 384  *              times out, or it completes.
 385  */
 386 STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
 387 {
 388         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 389                sizeof(SCpnt->data_cmnd));
 390         SCpnt->request_buffer = SCpnt->buffer;
 391         SCpnt->request_bufflen = SCpnt->bufflen;
 392         SCpnt->use_sg = SCpnt->old_use_sg;
 393         SCpnt->cmd_len = SCpnt->old_cmd_len;
 394         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 395         SCpnt->underflow = SCpnt->old_underflow;
 396
 397         scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
 398
 399         /*
 400          * Hey, we are done.  Let's look to see what happened.
 401          */
 402         return SCpnt->eh_state;
 403 }
 404
 405 /*
 406  * Function:  scsi_request_sense()
 407  *
 408  * Purpose:     Request sense data from a particular target.
 409  *
 410  * Returns:     SUCCESS - we were able to get the sense data.
 411  *              FAILED  - we were not able to get the sense data.
 412  *
 413  * Notes:       Some hosts automatically obtain this information, others
 414  *              require that we obtain it on our own.
 415  *
 416  *              This function will *NOT* return until the command either
 417  *              times out, or it completes.
 418  */
 419 STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
 420 {
 421         static unsigned char generic_sense[6] =
 422         {REQUEST_SENSE, 0, 0, 0, 255, 0};
 423         unsigned char scsi_result0[256], *scsi_result = NULL;
 424         int saved_result;
 425
 426         ASSERT_LOCK(&io_request_lock, 0);
 427
 428         memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
 429                sizeof(generic_sense));
 430
 431         if (SCpnt->device->scsi_level <= SCSI_2)
 432                 SCpnt->cmnd[1] = SCpnt->lun << 5;
 433
 434         scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
 435             ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
 436
 437         if (scsi_result == NULL) {
 438                 printk("cannot allocate scsi_result in scsi_request_sense.\n");
 439                 return FAILED;
 440         }
 441         /*
 442          * Zero the sense buffer.  Some host adapters automatically always request
 443          * sense, so it is not a good idea that SCpnt->request_buffer and
 444          * SCpnt->sense_buffer point to the same address (DB).
 445          * 0 is not a valid sense code.
 446          */
 447         memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
 448         memset((void *) scsi_result, 0, 256);
 449
 450         saved_result = SCpnt->result;
 451         SCpnt->request_buffer = scsi_result;
 452         SCpnt->request_bufflen = 256;
 453         SCpnt->use_sg = 0;
 454         SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
 455         SCpnt->sc_data_direction = SCSI_DATA_READ;
 456         SCpnt->underflow = 0;
 457
 458         scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
 459
 460         /* Last chance to have valid sense data */
 461         if (!scsi_sense_valid(SCpnt))
 462                 memcpy((void *) SCpnt->sense_buffer,
 463                        SCpnt->request_buffer,
 464                        sizeof(SCpnt->sense_buffer));
 465
 466         if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
 467                 kfree(scsi_result);
 468
 469         /*
 470          * When we eventually call scsi_finish, we really wish to complete
 471          * the original request, so let's restore the original data. (DB)
 472          */
 473         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 474                sizeof(SCpnt->data_cmnd));
 475         SCpnt->result = saved_result;
 476         SCpnt->request_buffer = SCpnt->buffer;
 477         SCpnt->request_bufflen = SCpnt->bufflen;
 478         SCpnt->use_sg = SCpnt->old_use_sg;
 479         SCpnt->cmd_len = SCpnt->old_cmd_len;
 480         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 481         SCpnt->underflow = SCpnt->old_underflow;
 482
 483         /*
 484          * Hey, we are done.  Let's look to see what happened.
 485          */
 486         return SCpnt->eh_state;
 487 }
 488
 489 /*
 490  * Function:  scsi_test_unit_ready()
 491  *
 492  * Purpose:     Run test unit ready command to see if the device is talking to us or not.
 493  *
 494  */
 495 STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
 496 {
 497         static unsigned char tur_command[6] =
 498         {TEST_UNIT_READY, 0, 0, 0, 0, 0};
 499
 500         memcpy((void *) SCpnt->cmnd, (void *) tur_command,
 501                sizeof(tur_command));
 502
 503         if (SCpnt->device->scsi_level <= SCSI_2)
 504                 SCpnt->cmnd[1] = SCpnt->lun << 5;
 505
 506         /*
 507          * Zero the sense buffer.  The SCSI spec mandates that any
 508          * untransferred sense data should be interpreted as being zero.
 509          */
 510         memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
 511
 512         SCpnt->request_buffer = NULL;
 513         SCpnt->request_bufflen = 0;
 514         SCpnt->use_sg = 0;
 515         SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
 516         SCpnt->underflow = 0;
 517         SCpnt->sc_data_direction = SCSI_DATA_NONE;
 518
 519         scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
 520
 521         /*
 522          * When we eventually call scsi_finish, we really wish to complete
 523          * the original request, so let's restore the original data. (DB)
 524          */
 525         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 526                sizeof(SCpnt->data_cmnd));
 527         SCpnt->request_buffer = SCpnt->buffer;
 528         SCpnt->request_bufflen = SCpnt->bufflen;
 529         SCpnt->use_sg = SCpnt->old_use_sg;
 530         SCpnt->cmd_len = SCpnt->old_cmd_len;
 531         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 532         SCpnt->underflow = SCpnt->old_underflow;
 533
 534         /*
 535          * Hey, we are done.  Let's look to see what happened.
 536          */
 537         SCSI_LOG_ERROR_RECOVERY(3,
 538                 printk("scsi_test_unit_ready: SCpnt %p eh_state %x\n",
 539                 SCpnt, SCpnt->eh_state));
 540         return SCpnt->eh_state;
 541 }
 542
 543 /*
 544  * This would normally need to get the IO request lock,
 545  * but as it doesn't actually touch anything that needs
 546  * to be locked we can avoid the lock here..
 547  */
 548 STATIC
 549 void scsi_sleep_done(struct semaphore *sem)
 550 {
 551         if (sem != NULL) {
 552                 up(sem);
 553         }
 554 }
 555
 556 void scsi_sleep(int timeout)
 557 {
 558         DECLARE_MUTEX_LOCKED(sem);
 559         struct timer_list timer;
 560
 561         init_timer(&timer);
 562         timer.data = (unsigned long) &sem;
 563         timer.expires = jiffies + timeout;
 564         timer.function = (void (*)(unsigned long)) scsi_sleep_done;
 565
 566         SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
 567
 568         add_timer(&timer);
 569
 570         down(&sem);
 571         del_timer(&timer);
 572 }
 573
/*
 * Function:  scsi_send_eh_cmnd
 *
 * Purpose:     Send a command out to a device as part of error recovery.
 *
 * Arguments:   SCpnt   - fully set up command to send.
 *              timeout - jiffies to allow the command before
 *                        scsi_eh_times_out() fires (queueing hosts only).
 *
 * Returns:     Nothing.  The outcome is left in SCpnt->eh_state, which is
 *              either SUCCESS or FAILED on return.
 *
 * Notes:       The initialization of the structures is quite a bit different
 *              in this case, and furthermore, there is a different completion
 *              handler.
 *
 *              The function sleeps until the command completes or times
 *              out, so it must be called without io_request_lock held.
 *              NOTE(review): a NEEDS_RETRY verdict re-sends the command
 *              with no retry limit visible in this function.
 */
STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
{
        unsigned long flags;
        struct Scsi_Host *host;

        ASSERT_LOCK(&io_request_lock, 0);

        host = SCpnt->host;

      retry:
        /*
         * We will use a queued command if possible, otherwise we will emulate the
         * queuing and calling of completion function ourselves.
         */
        SCpnt->owner = SCSI_OWNER_LOWLEVEL;

        if (host->can_queue) {
                DECLARE_MUTEX_LOCKED(sem);

                SCpnt->eh_state = SCSI_STATE_QUEUED;

                /* Arm the timeout before the command can possibly complete. */
                scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);

                /*
                 * Set up the semaphore so we wait for the command to complete.
                 * Both scsi_eh_done() and scsi_eh_times_out() signal it
                 * through host->eh_action.
                 */
                SCpnt->host->eh_action = &sem;
                SCpnt->request.rq_status = RQ_SCSI_BUSY;

                spin_lock_irqsave(&io_request_lock, flags);
                host->hostt->queuecommand(SCpnt, scsi_eh_done);
                spin_unlock_irqrestore(&io_request_lock, flags);

                /* Sleep until either the completion or the timeout wakes us. */
                down(&sem);

                SCpnt->host->eh_action = NULL;

                /*
                 * See if timeout.  If so, tell the host to forget about it.
                 * In other words, we don't want a callback any more.
                 */
                if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
                        SCpnt->owner = SCSI_OWNER_LOWLEVEL;

                        /*
                         * As far as the low level driver is
                         * concerned, this command is still active, so
                         * we must give the low level driver a chance
                         * to abort it. (DB)
                         *
                         * FIXME(eric) - we are not tracking whether we could
                         * abort a timed out command or not.  Not sure how
                         * we should treat them differently anyways.
                         */
                        spin_lock_irqsave(&io_request_lock, flags);
                        if (SCpnt->host->hostt->eh_abort_handler)
                                SCpnt->host->hostt->eh_abort_handler(SCpnt);
                        spin_unlock_irqrestore(&io_request_lock, flags);

                        SCpnt->request.rq_status = RQ_SCSI_DONE;
                        SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;

                        /* A timed out command is reported as FAILED. */
                        SCpnt->eh_state = FAILED;
                }
                SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
                                                SCpnt, SCpnt->eh_state));
        } else {
                int temp;

                /*
                 * We damn well had better never use this code.  There is no timeout
                 * protection here, since we would end up waiting in the actual low
                 * level driver, we don't know how to wake it up.
                 */
                spin_lock_irqsave(&io_request_lock, flags);
                temp = host->hostt->command(SCpnt);
                spin_unlock_irqrestore(&io_request_lock, flags);

                /* The synchronous entry point returns the result directly. */
                SCpnt->result = temp;
                /* Fall through to code below to examine status. */
                SCpnt->eh_state = SUCCESS;
        }

        /*
         * Now examine the actual status codes to see whether the command actually
         * did complete normally.
         */
        if (SCpnt->eh_state == SUCCESS) {
                int ret = scsi_eh_completed_normally(SCpnt);
                SCSI_LOG_ERROR_RECOVERY(3,
                        printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %x\n", ret));
                switch (ret) {
                case SUCCESS:
                        SCpnt->eh_state = SUCCESS;
                        break;
                case NEEDS_RETRY:
                        /* Send the very same command again from the top. */
                        goto retry;
                case FAILED:
                default:
                        SCpnt->eh_state = FAILED;
                        break;
                }
        } else {
                /* Anything other than SUCCESS at this point counts as failure. */
                SCpnt->eh_state = FAILED;
        }
}
 689
 690 /*
 691  * Function:  scsi_unit_is_ready()
 692  *
 693  * Purpose:     Called after TEST_UNIT_READY is run, to test to see if
 694  *              the unit responded in a way that indicates it is ready.
 695  */
 696 STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
 697 {
 698         if (SCpnt->result) {
 699                 if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
 700                      (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
 701                     ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
 702                         if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
 703                             ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
 704                             ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
 705                                 return 0;
 706                         }
 707                 }
 708         }
 709         return 1;
 710 }
 711
 712 /*
 713  * Function:    scsi_eh_finish_command
 714  *
 715  * Purpose:     Handle a command that we are finished with WRT error handling.
 716  *
 717  * Arguments:   SClist - pointer to list into which we are putting completed commands.
 718  *              SCpnt  - command that is completing
 719  *
 720  * Notes:       We don't want to use the normal command completion while we are
 721  *              are still handling errors - it may cause other commands to be queued,
 722  *              and that would disturb what we are doing.  Thus we really want to keep
 723  *              a list of pending commands for final completion, and once we
 724  *              are ready to leave error handling we handle completion for real.
 725  */
 726 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
 727 {
 728         SCpnt->state = SCSI_STATE_BHQUEUE;
 729         SCpnt->bh_next = *SClist;
 730         /*
 731          * Set this back so that the upper level can correctly free up
 732          * things.
 733          */
 734         SCpnt->use_sg = SCpnt->old_use_sg;
 735         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 736         SCpnt->underflow = SCpnt->old_underflow;
 737         *SClist = SCpnt;
 738 }
 739
 740 /*
 741  * Function:  scsi_try_to_abort_command
 742  *
 743  * Purpose:     Ask host adapter to abort a running command.
 744  *
 745  * Returns:     FAILED          Operation failed or not supported.
 746  *              SUCCESS         Succeeded.
 747  *
 748  * Notes:       This function will not return until the user's completion
 749  *              function has been called.  There is no timeout on this
 750  *              operation.  If the author of the low-level driver wishes
 751  *              this operation to be timed, they can provide this facility
 752  *              themselves.  Helper functions in scsi_error.c can be supplied
 753  *              to make this easier to do.
 754  *
 755  * Notes:       It may be possible to combine this with all of the reset
 756  *              handling to eliminate a lot of code duplication.  I don't
 757  *              know what makes more sense at the moment - this is just a
 758  *              prototype.
 759  */
 760 STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
 761 {
 762         int rtn;
 763         unsigned long flags;
 764
 765         SCpnt->eh_state = FAILED;       /* Until we come up with something better */
 766
 767         if (SCpnt->host->hostt->eh_abort_handler == NULL) {
 768                 return FAILED;
 769         }
 770         /*
 771          * scsi_done was called just after the command timed out and before
 772          * we had a chance to process it. (DB)
 773          */
 774         if (SCpnt->serial_number == 0)
 775                 return SUCCESS;
 776
 777         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 778
 779         spin_lock_irqsave(&io_request_lock, flags);
 780         rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
 781         spin_unlock_irqrestore(&io_request_lock, flags);
 782         return rtn;
 783 }
 784
 785 /*
 786  * Function:  scsi_try_bus_device_reset
 787  *
 788  * Purpose:     Ask host adapter to perform a bus device reset for a given
 789  *              device.
 790  *
 791  * Returns:     FAILED          Operation failed or not supported.
 792  *              SUCCESS         Succeeded.
 793  *
 794  * Notes:       There is no timeout for this operation.  If this operation is
 795  *              unreliable for a given host, then the host itself needs to put a
 796  *              timer on it, and set the host back to a consistent state prior
 797  *              to returning.
 798  */
 799 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
 800 {
 801         unsigned long flags;
 802         int rtn;
 803
 804         SCpnt->eh_state = FAILED;       /* Until we come up with something better */
 805
 806         if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
 807                 return FAILED;
 808         }
 809         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 810
 811         spin_lock_irqsave(&io_request_lock, flags);
 812         rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
 813         spin_unlock_irqrestore(&io_request_lock, flags);
 814
 815         if (rtn == SUCCESS)
 816                 SCpnt->eh_state = SUCCESS;
 817
 818         return SCpnt->eh_state;
 819 }
 820
 821 /*
 822  * Function:  scsi_try_bus_reset
 823  *
 824  * Purpose:     Have the low-level driver reset the bus (channel) that the
 825  *              given command was issued on.
 826  *
 827  * Returns:     FAILED if the host template has no eh_bus_reset_handler,
 828  *              otherwise SCpnt->eh_state (SUCCESS once the handler says so).
 829  *
 830  * Notes:       Sleeps for BUS_RESET_SETTLE_TIME once the handler returns,
 831  *              whatever the handler reported.
 832  */
 833 STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
 834 {
 835         unsigned long irq_state;
 836         int outcome;
 837         Scsi_Device *dev;
 838
 839         SCpnt->eh_state = FAILED;       /* pessimistic default */
 840         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 841         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 842
 843         if (!SCpnt->host->hostt->eh_bus_reset_handler)
 844                 return FAILED;
 845
 846         /* The driver's handler runs with io_request_lock held. */
 847         spin_lock_irqsave(&io_request_lock, irq_state);
 848         outcome = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
 849         spin_unlock_irqrestore(&io_request_lock, irq_state);
 850         if (outcome == SUCCESS)
 851                 SCpnt->eh_state = SUCCESS;
 852
 853         /* The settle delay is taken whether or not the reset worked. */
 854         scsi_sleep(BUS_RESET_SETTLE_TIME);
 855         if (SCpnt->eh_state != SUCCESS)
 856                 return SCpnt->eh_state;
 857
 858         /* Reset worked: flag the devices on this channel accordingly. */
 859         for (dev = SCpnt->host->host_queue; dev; dev = dev->next) {
 860                 if (dev->channel != SCpnt->channel)
 861                         continue;
 862                 dev->was_reset = 1;
 863                 dev->expecting_cc_ua = 1;
 864         }
 865         return SCpnt->eh_state;
 866 }
 867
 868 /*
 869  * Function:  scsi_try_host_reset
 870  *
 871  * Purpose:     Have the low-level driver reset the host adapter, and with
 872  *              it the bus.
 873  *
 874  * Returns:     FAILED if the host template has no eh_host_reset_handler,
 875  *              otherwise SCpnt->eh_state (SUCCESS once the handler says so).
 876  *
 877  * Notes:       Sleeps for HOST_RESET_SETTLE_TIME once the handler returns,
 878  *              whatever the handler reported.
 879  */
 880 STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
 881 {
 882         unsigned long irq_state;
 883         int outcome;
 884         Scsi_Device *dev;
 885
 886         SCpnt->eh_state = FAILED;       /* pessimistic default */
 887         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 888         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 889
 890         if (!SCpnt->host->hostt->eh_host_reset_handler)
 891                 return FAILED;
 892
 893         spin_lock_irqsave(&io_request_lock, irq_state);
 894         outcome = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
 895         spin_unlock_irqrestore(&io_request_lock, irq_state);
 896         if (outcome == SUCCESS)
 897                 SCpnt->eh_state = SUCCESS;
 898
 899         /* The settle delay is taken whether or not the reset worked. */
 900         scsi_sleep(HOST_RESET_SETTLE_TIME);
 901         if (SCpnt->eh_state != SUCCESS)
 902                 return SCpnt->eh_state;
 903
 904         /* Reset worked: flag every device on this host accordingly. */
 905         for (dev = SCpnt->host->host_queue; dev; dev = dev->next) {
 906                 dev->was_reset = 1;
 907                 dev->expecting_cc_ua = 1;
 908         }
 909         return SCpnt->eh_state;
 910 }
 911
 912 /*
 913  * Function:  scsi_decide_disposition
 914  *
 915  * Purpose:     Examine a command block that has come back from the low-level
 916  *              and figure out what to do next.
 917  *
 918  * Returns:     SUCCESS         - pass on to upper level.
 919  *              FAILED          - pass on to error handler thread.
 920  *              NEEDS_RETRY     - command should be retried.
 921  *              ADD_TO_MLQUEUE  - the status byte was QUEUE_FULL; handed
 922  *                                back for the caller to deal with.
 923  *
 924  * Notes:       This is *ONLY* called when we are examining the status
 925  *              after sending out the actual data command.  Any commands
 926  *              that are queued for error recovery (i.e. TEST_UNIT_READY)
 927  *              do *NOT* come through here.
 928  *
 929  *              NOTE - When this routine returns FAILED, it means the error
 930  *              handler thread is woken.  In cases where the error code
 931  *              indicates an error that doesn't require the error handler
 932  *              thread (i.e. we don't need to abort/reset), then this function
 933  *              should return SUCCESS.
 934  */
 935 int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
 936 {
 937         int rtn;
 938
 939         /*
 940          * If the device is offline, then we clearly just pass the result back
 941          * up to the top level.
 942          */
 943         if (SCpnt->device->online == FALSE) {
 944                 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
 945                 return SUCCESS;
 946         }
 947         /*
 948          * First check the host byte, to see if there is anything in there
 949          * that would indicate what we need to do.
 950          */
 951
 952         switch (host_byte(SCpnt->result)) {
 953         case DID_PASSTHROUGH:
 954                 /*
 955                  * No matter what, pass this through to the upper layer.
 956                  * Nuke this special code so that it looks like we are saying
 957                  * DID_OK.
 958                  */
 959                 SCpnt->result &= 0xff00ffff;
 960                 return SUCCESS;
 961         case DID_OK:
 962                 /*
 963                  * Looks good.  Drop through, and check the next byte.
 964                  */
 965                 break;
 966         case DID_NO_CONNECT:
 967         case DID_BAD_TARGET:
 968         case DID_ABORT:
 969                 /*
 970                  * Note - this means that we just report the status back to the
 971                  * top level driver, not that we actually think that it indicates
 972                  * success.
 973                  */
 974                 return SUCCESS;
 975                 /*
 976                  * When the low level driver returns DID_SOFT_ERROR,
 977                  * it is responsible for keeping an internal retry counter
 978                  * in order to avoid endless loops (DB)
 979                  *
 980                  * Actually this is a bug in this function here.  We should
 981                  * be mindful of the maximum number of retries specified
 982                  * and not get stuck in a loop.
 983                  */
 984         case DID_SOFT_ERROR:
 985                 goto maybe_retry;
 986
 987         case DID_ERROR:
 988                 if (msg_byte(SCpnt->result) == COMMAND_COMPLETE &&
 989                     status_byte(SCpnt->result) == RESERVATION_CONFLICT)
 990                         /*
 991                          * execute reservation conflict processing code
 992                          * lower down
 993                          */
 994                         break;
 995                 /* FALLTHROUGH */
 996
 997         case DID_BUS_BUSY:
 998         case DID_PARITY:
 999                 goto maybe_retry;
1000         case DID_TIME_OUT:
1001                 /*
1002                  * When we scan the bus, we get timeout messages for
1003                  * these commands if there is no device available.
1004                  * Other hosts report DID_NO_CONNECT for the same thing.
1005                  */
1006                 if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
1007                      SCpnt->cmnd[0] == INQUIRY)) {
1008                         return SUCCESS;
1009                 } else {
1010                         return FAILED;
1011                 }
1012         case DID_RESET:
1013                 /*
1014                  * A reset that we initiated ourselves (IS_RESETTING) makes the
1015                  * command a candidate for a retry; any other reset is reported up.
1016                  */
1017                 if (SCpnt->flags & IS_RESETTING) {
1018                         SCpnt->flags &= ~IS_RESETTING;
1019                         goto maybe_retry;
1020                 }
1021                 return SUCCESS;
1022         default:
1023                 return FAILED;
1024         }
1025
1026         /*
1027          * Next, check the message byte.
1028          */
1029         if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1030                 return FAILED;
1031         }
1032         /*
1033          * Now, check the status byte to see if this indicates anything special.
1034          */
1035         switch (status_byte(SCpnt->result)) {
1036         case QUEUE_FULL:
1037                 /*
1038                  * The case of trying to send too many commands to a tagged queueing
1039                  * device.
1040                  */
1041                 return ADD_TO_MLQUEUE;
1042         case GOOD:
1043         case COMMAND_TERMINATED:
1044                 return SUCCESS;
1045         case CHECK_CONDITION:
1046                 rtn = scsi_check_sense(SCpnt);
1047                 if (rtn == NEEDS_RETRY) {
1048                         goto maybe_retry;
1049                 }
1050                 return rtn;
1051         case CONDITION_GOOD:
1052         case INTERMEDIATE_GOOD:
1053         case INTERMEDIATE_C_GOOD:
1054                 /*
1055                  * Who knows?  FIXME(eric)
1056                  */
1057                 return SUCCESS;
1058         case BUSY:
1059                 goto maybe_retry;
1060
1061         case RESERVATION_CONFLICT:
1062                 printk("scsi%d (%d,%d,%d) : RESERVATION CONFLICT\n",
1063                        SCpnt->host->host_no, SCpnt->channel,
1064                        SCpnt->device->id, SCpnt->device->lun);
1065                 return SUCCESS; /* causes immediate I/O error */
1066         default:
1067                 return FAILED;
1068         }
1069         return FAILED;  /* not reached: every case above returns or jumps */
1070
1071       maybe_retry:
1072         /* Every pass through here bumps retries; retry only while under allowed. */
1073         if ((++SCpnt->retries) < SCpnt->allowed) {
1074                 return NEEDS_RETRY;
1075         } else {
1076                 /*
1077                  * No more retries - report this one back to upper level.
1078                  */
1079                 return SUCCESS;
1080         }
1081 }
1082
1083 /*
1084  * Function:  scsi_eh_completed_normally
1085  *
1086  * Purpose:     Judge the result of a command that was issued during error
1087  *              recovery, and say what should happen next.
1088  *
1089  * Returns:     SUCCESS         - the command went through.
1090  *              FAILED          - it did not.
1091  *              NEEDS_RETRY     - it should be issued again.
1092  *
1093  * Notes:       Counterpart of scsi_decide_disposition() for commands
1094  *              queued by the error handler.  No retry counting is done
1095  *              here, and far fewer results are accepted as good: BUSY,
1096  *              QUEUE_FULL and RESERVATION_CONFLICT all count as FAILED.
1097  */
1098 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1099 {
1100         /*
1101          * Host byte first.  Only DID_OK lets us go on to look at the
1102          * message and status bytes; DID_RESET gets special treatment and
1103          * anything else is a failure.
1104          */
1105         switch (host_byte(SCpnt->result)) {
1106         case DID_OK:
1107                 break;
1108
1109         case DID_RESET:
1110                 /*
1111                  * A reset that we asked for ourselves is expected.  We
1112                  * cannot tell whether this command really has to be rerun
1113                  * (it does if it was the original data command), so clear
1114                  * the flag and ask for a retry.
1115                  */
1116                 if (SCpnt->flags & IS_RESETTING) {
1117                         SCpnt->flags &= ~IS_RESETTING;
1118                         return NEEDS_RETRY;
1119                 }
1120
1121                 /*
1122                  * A reset that we did not ask for, and we are already in
1123                  * the error handler.  Let the sense data decide:
1124                  * scsi_check_sense() reports FAILED if there is none.
1125                  */
1126                 return scsi_check_sense(SCpnt);
1127
1128         default:
1129                 return FAILED;
1130         }
1131
1132         /*
1133          * The message byte has to say that the command completed.
1134          */
1135         if (msg_byte(SCpnt->result) != COMMAND_COMPLETE)
1136                 return FAILED;
1137
1138         /* Finally, the status byte. */
1139         switch (status_byte(SCpnt->result)) {
1140         case CHECK_CONDITION:
1141                 return scsi_check_sense(SCpnt);
1142
1143         case GOOD:
1144         case COMMAND_TERMINATED:
1145         case CONDITION_GOOD:
1146         case INTERMEDIATE_GOOD:
1147         case INTERMEDIATE_C_GOOD:
1148                 /* Who knows about the last three?  FIXME(eric) */
1149                 return SUCCESS;
1150         }
1151
1152         /*
1153          * BUSY, QUEUE_FULL, RESERVATION_CONFLICT and any status not listed
1154          * above are not acceptable during error recovery.
1155          */
1156         return FAILED;
1157 }
1158
1159 /*
1160  * Function:  scsi_check_sense
1161  *
1162  * Purpose:     Look at the sense data attached to a command and suggest
1163  *              what to do with the command.
1164  *
1165  * Returns:     FAILED          - there is no valid sense data.
1166  *              NEEDS_RETRY     - the command is worth issuing again.
1167  *              SUCCESS         - hand the result to the upper level.
1168  *
1169  * Notes:       Clears device->expecting_cc_ua when it consumes the flag.
1170  */
1171 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1172 {
1173         int sense_key;
1174
1175         if (!scsi_sense_valid(SCpnt))
1176                 return FAILED;
1177
1178         /*
1179          * Any of bits 7-5 of sense byte 2 set: leave the interpretation to
1180          * the upper level.
1181          */
1182         if (SCpnt->sense_buffer[2] & 0xe0)
1183                 return SUCCESS;
1184
1185         sense_key = SCpnt->sense_buffer[2] & 0xf;
1186         switch (sense_key) {
1187         case ABORTED_COMMAND:
1188         case MEDIUM_ERROR:
1189                 return NEEDS_RETRY;
1190
1191         case NOT_READY:
1192         case UNIT_ATTENTION:
1193                 break;          /* looked at more closely below */
1194
1195         default:
1196                 /*
1197                  * NO_SENSE, RECOVERED_ERROR, ILLEGAL_REQUEST, BLANK_CHECK,
1198                  * DATA_PROTECT, HARDWARE_ERROR, the unsupported COPY_ABORTED,
1199                  * VOLUME_OVERFLOW and MISCOMPARE, and anything unknown.
1200                  */
1201                 return SUCCESS;
1202         }
1203
1204         /*
1205          * NOT_READY or UNIT_ATTENTION.  If we are expecting a CC/UA because
1206          * of a reset that we performed, swallow it and retry.
1207          */
1208         if (SCpnt->device->expecting_cc_ua) {
1209                 SCpnt->device->expecting_cc_ua = 0;
1210                 return NEEDS_RETRY;
1211         }
1212
1213         /* ASC 0x04 / ASCQ 0x01: the unit is in the process of becoming ready. */
1214         if (SCpnt->sense_buffer[12] == 0x04 && SCpnt->sense_buffer[13] == 0x01)
1215                 return NEEDS_RETRY;
1216
1217         /* Otherwise the upper-level driver gets to deal with it. */
1218         return SUCCESS;
1219 }
1220
1221
1222 /*
1223  * Function:  scsi_restart_operations
1224  *
1225  * Purpose:     Restart IO operations to the specified host.
1226  *
1227  * Arguments:   host  - host that we are restarting
1228  *
1229  * Lock status: io_request_lock must not be held on entry; it is taken and
1230  *              released here around the calls to the request functions.
1231  *
1232  * Returns:     Nothing
1233  *
1234  * Notes:       When we entered the error handler, we blocked all further
1235  *              I/O to this device.  We need to 'reverse' this process.
1236  */
1237 STATIC void scsi_restart_operations(struct Scsi_Host *host)
1238 {
1239         Scsi_Device *SDpnt;
1240         unsigned long flags;
1241
1242         ASSERT_LOCK(&io_request_lock, 0);
1243
1244         /*
1245          * First wake up anything sleeping directly on the host.  This will
1246          * be requests for character device operations, and also ioctls to
1247          * queued block devices.
1248          */
1249         SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1250         wake_up(&host->host_wait);
1251
1252         /*
1253          * Then kick the request function of every device queue, since
1254          * everything was held back while error recovery was running.  Stop
1255          * as soon as the host itself cannot take more work, but merely skip
1256          * a device that is blocked on its own so that the devices after it
1257          * in the list still get restarted.
1258          */
1259         spin_lock_irqsave(&io_request_lock, flags);
1260         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1261                 request_queue_t *q;
1262                 if ((host->can_queue > 0 && host->host_busy >= host->can_queue)
1263                     || host->host_blocked || host->host_self_blocked)
1264                         break;
1265                 if (SDpnt->device_blocked)
1266                         continue;
1267                 q = &SDpnt->request_queue;
1268                 q->request_fn(q);
1269         }
1270         spin_unlock_irqrestore(&io_request_lock, flags);
1271 }
1272
1273 /*
1274  * Function:  scsi_unjam_host
1275  *
1276  * Purpose:     Attempt to fix a host which has a command that failed for
1277  *              some reason.
1278  *
1279  * Arguments:   host    - host that needs unjamming.
1280  *
1281  * Returns:     Nothing
1282  *
1283  * Notes:       When we come in here, we *know* that all commands on the
1284  *              bus have either completed, failed or timed out.  We also
1285  *              know that no further commands are being sent to the host,
1286  *              so things are relatively quiet and we have freedom to
1287  *              fiddle with things as we wish.
1288  *
1289  * Additional note:  This is only the *default* implementation.  It is possible
1290  *              for individual drivers to supply their own version of this
1291  *              function, and if the maintainer wishes to do this, it is
1292  *              strongly suggested that this function be taken as a template
1293  *              and modified.  This function was designed to correctly handle
1294  *              problems for about 95% of the different cases out there, and
1295  *              it should always provide at least a reasonable amount of error
1296  *              recovery.
1297  *
1298  * Note3:       Any command marked 'FAILED' or 'TIMEOUT' must eventually
1299  *              have scsi_finish_command() called for it.  We do all of
1300  *              the retry stuff here, so when we restart the host after we
1301  *              return it should have an empty queue.
1302  */
1303 STATIC int scsi_unjam_host(struct Scsi_Host *host)
1304 {
1305         int devices_failed;
1306         int numfailed;
1307         int ourrtn;
1308         int rtn = FALSE;
1309         int result;
1310         Scsi_Cmnd *SCloop;
1311         Scsi_Cmnd *SCpnt;
1312         Scsi_Device *SDpnt;
1313         Scsi_Device *SDloop;
1314         Scsi_Cmnd *SCdone;
1315         int timed_out;
1316
1317         ASSERT_LOCK(&io_request_lock, 0);
1318
1319         SCdone = NULL;
1320
1321         /*
1322          * First, protect against any sort of race condition.  If any of the outstanding
1323          * commands are in states that indicate that we are not yet blocked (i.e. we are
1324          * not in a quiet state) then we got woken up in error.  If we ever end up here,
1325          * we need to re-examine some of the assumptions.
1326          */
1327         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1328                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1329                         if (SCpnt->state == SCSI_STATE_FAILED
1330                             || SCpnt->state == SCSI_STATE_TIMEOUT
1331                             || SCpnt->state == SCSI_STATE_INITIALIZING
1332                             || SCpnt->state == SCSI_STATE_UNUSED) {
1333                                 continue;
1334                         }
1335                         /*
1336                          * Rats.  Something is still floating around out there.  This could
1337                          * be the result of the fact that the upper level drivers are still frobbing
1338                          * commands that might have succeeded.  There are two outcomes.  One is that
1339                          * the command block will eventually be freed, and the other one is that
1340                          * the command will be queued and will be finished along the way.
1341                          */
1342                         SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1343
1344 /*
1345  *        panic("SCSI Error handler woken too early\n");
1346  *
1347  * This is no longer a problem, since now the code cares only about
1348  * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1349  * Other states are useful only to release active commands when devices are
1350  * set offline. If (host->host_active == host->host_busy) we can safely assume
1351  * that there are no commands in state other than TIMEOUT or FAILED. (DB)
1352  *
1353  * FIXME:
1354  * It is not easy to release correctly commands according to their state when
1355  * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1356  * When a device is set offline, we can have some command with
1357  * rq_status=RQ_SCSI_BUSY, owner=SCSI_OWNER_HIGHLEVEL,
1358  * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1359  * (DB, 17 May 1998)
1360  */
1361                 }
1362         }
1363
1364         /*
1365          * Next, see if we need to request sense information.  if so,
1366          * then get it now, so we have a better idea of what to do.
1367          * FIXME(eric) this has the unfortunate side effect that if a host
1368          * adapter does not automatically request sense information, that we end
1369          * up shutting it down before we request it.  All hosts should be doing this
1370          * anyways, so for now all I have to say is tough noogies if you end up in here.
1371          * On second thought, this is probably a good idea.  We *really* want to give
1372          * authors an incentive to automatically request this.
1373          */
1374         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1375
1376         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1377                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1378                         if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1379                                 continue;
1380                         }
1381                         SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1382                                                           SCpnt->target));
1383                         rtn = scsi_request_sense(SCpnt);
1384                         if (rtn != SUCCESS) {
1385                                 continue;
1386                         }
1387                         SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1388                                                   SCpnt, SCpnt->result));
1389                         SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1390
1391                         result = scsi_decide_disposition(SCpnt);
1392
1393                         /*
1394                          * If the result was normal, then just pass it along to the
1395                          * upper level.
1396                          */
1397                         if (result == SUCCESS) {
1398                                 SCpnt->host->host_failed--;
1399                                 scsi_eh_finish_command(&SCdone, SCpnt);
1400                         }
1401                         if (result != NEEDS_RETRY) {
1402                                 continue;
1403                         }
1404                         /*
1405                          * We only come in here if we want to retry a
1406                          * command.  The test to see whether the command
1407                          * should be retried should be keeping track of the
1408                          * number of tries, so we don't end up looping, of
1409                          * course.
1410                          */
1411                         SCpnt->state = NEEDS_RETRY;
1412                         rtn = scsi_eh_retry_command(SCpnt);
1413                         if (rtn != SUCCESS) {
1414                                 continue;
1415                         }
1416                         /*
1417                          * We eventually hand this one back to the top level.
1418                          */
1419                         SCpnt->host->host_failed--;
1420                         scsi_eh_finish_command(&SCdone, SCpnt);
1421                 }
1422         }
1423
1424         /*
1425          * Go through the list of commands and figure out where we stand and how bad things
1426          * really are.
1427          */
1428         numfailed = 0;
1429         timed_out = 0;
1430         devices_failed = 0;
1431         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1432                 unsigned int device_error = 0;
1433
1434                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1435                         if (SCpnt->state == SCSI_STATE_FAILED) {
1436                                 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1437                                                          SCpnt->target));
1438                                 numfailed++;
1439                                 device_error++;
1440                         }
1441                         if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1442                                 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1443                                                          SCpnt->target));
1444                                 timed_out++;
1445                                 device_error++;
1446                         }
1447                 }
1448                 if (device_error > 0) {
1449                         devices_failed++;
1450                 }
1451         }
1452
1453         SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1454                                   numfailed, timed_out, devices_failed));
1455
1456         if (host->host_failed == 0) {
1457                 ourrtn = TRUE;
1458                 goto leave;
1459         }
1460         /*
1461          * Next, try and see whether or not it makes sense to try and abort
1462          * the running command.  This only works out to be the case if we have
1463          * one command that has timed out.  If the command simply failed, it
1464          * makes no sense to try and abort the command, since as far as the
1465          * host adapter is concerned, it isn't running.
1466          */
1467
1468         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1469
1470         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1471                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1472                         if (SCloop->state != SCSI_STATE_TIMEOUT) {
1473                                 continue;
1474                         }
1475                         rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1476                         if (rtn == SUCCESS) {
1477                                 rtn = scsi_test_unit_ready(SCloop);
1478
1479                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1480                                         rtn = scsi_eh_retry_command(SCloop);
1481
1482                                         if (rtn == SUCCESS) {
1483                                                 SCloop->host->host_failed--;
1484                                                 scsi_eh_finish_command(&SCdone, SCloop);
1485                                         }
1486                                 }
1487                         }
1488                 }
1489         }
1490
1491         /*
1492          * If we have corrected all of the problems, then we are done.
1493          */
1494         if (host->host_failed == 0) {
1495                 ourrtn = TRUE;
1496                 goto leave;
1497         }
1498         /*
1499          * Either the abort wasn't appropriate, or it didn't succeed.
1500          * Now try a bus device reset.  Still, look to see whether we have
1501          * multiple devices that are jammed or not - if we have multiple devices,
1502          * it makes no sense to try BUS_DEVICE_RESET - we really would need
1503          * to try a BUS_RESET instead.
1504          *
1505          * Does this make sense - should we try BDR on each device individually?
1506          * Yes, definitely.
1507          */
1508         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1509
1510         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1511                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1512                         if (SCloop->state == SCSI_STATE_FAILED
1513                             || SCloop->state == SCSI_STATE_TIMEOUT) {
1514                                 break;
1515                         }
1516                 }
1517
1518                 if (SCloop == NULL) {
1519                         continue;
1520                 }
1521                 /*
1522                  * OK, we have a device that is having problems.  Try and send
1523                  * a bus device reset to it.
1524                  *
1525                  * FIXME(eric) - make sure we handle the case where multiple
1526                  * commands to the same device have failed. They all must
1527                  * get properly restarted.
1528                  */
1529                 rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1530
1531                 if (rtn == SUCCESS) {
1532                         rtn = scsi_test_unit_ready(SCloop);
1533
1534                         if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1535                                 rtn = scsi_eh_retry_command(SCloop);
1536
1537                                 if (rtn == SUCCESS) {
1538                                         SCloop->host->host_failed--;
1539                                         scsi_eh_finish_command(&SCdone, SCloop);
1540                                 }
1541                         }
1542                 }
1543         }
1544
1545         if (host->host_failed == 0) {
1546                 ourrtn = TRUE;
1547                 goto leave;
1548         }
1549         /*
1550          * If we ended up here, we have serious problems.  The only thing left
1551          * to try is a full bus reset.  If someone has grabbed the bus and isn't
1552          * letting go, then perhaps this will help.
1553          */
1554         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1555
1556         /*
1557          * We really want to loop over the various channels, and do this on
1558          * a channel by channel basis.  We should also check to see if any
1559          * of the failed commands are on soft_reset devices, and if so, skip
1560          * the reset.
1561          */
1562         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1563               next_device:
1564                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1565                         if (SCpnt->state != SCSI_STATE_FAILED
1566                             && SCpnt->state != SCSI_STATE_TIMEOUT) {
1567                                 continue;
1568                         }
1569                         /*
1570                          * We have a failed command.  Make sure there are no other failed
1571                          * commands on the same channel that are timed out and implement a
1572                          * soft reset.
1573                          */
1574                         for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1575                                 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1576                                         if (SCloop->channel != SCpnt->channel) {
1577                                                 continue;
1578                                         }
1579                                         if (SCloop->state != SCSI_STATE_FAILED
1580                                             && SCloop->state != SCSI_STATE_TIMEOUT) {
1581                                                 continue;
1582                                         }
1583                                         if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1584                                                 /*
1585                                                  * If this device uses the soft reset option, and this
1586                                                  * is one of the devices acting up, then our only
1587                                                  * option is to wait a bit, since the command is
1588                                                  * supposedly still running.
1589                                                  *
1590                                                  * FIXME(eric) - right now we will just end up falling
1591                                                  * through to the 'take device offline' case.
1592                                                  *
1593                                                  * FIXME(eric) - It is possible that the command completed
1594                                                  * *after* the error recovery procedure started, and if this
1595                                                  * is the case, we are worrying about nothing here.
1596                                                  */
1597
1598                                                 scsi_sleep(1 * HZ);
1599                                                 goto next_device;
1600                                         }
1601                                 }
1602                         }
1603
1604                         /*
1605                          * We now know that we are able to perform a reset for the
1606                          * bus that SCpnt points to.  There are no soft-reset devices
1607                          * with outstanding timed out commands.
1608                          */
1609                         rtn = scsi_try_bus_reset(SCpnt);
1610                         if (rtn == SUCCESS) {
1611                                 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1612                                         for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1613                                                 if (SCloop->channel != SCpnt->channel) {
1614                                                         continue;
1615                                                 }
1616                                                 if (SCloop->state != SCSI_STATE_FAILED
1617                                                     && SCloop->state != SCSI_STATE_TIMEOUT) {
1618                                                         continue;
1619                                                 }
1620                                                 rtn = scsi_test_unit_ready(SCloop);
1621
1622                                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1623                                                         rtn = scsi_eh_retry_command(SCloop);
1624
1625                                                         if (rtn == SUCCESS) {
1626                                                                 SCpnt->host->host_failed--;
1627                                                                 scsi_eh_finish_command(&SCdone, SCloop);
1628                                                         }
1629                                                 }
1630                                                 /*
1631                                                  * If the bus reset worked, but we are still unable to
1632                                                  * talk to the device, take it offline.
1633                                                  * FIXME(eric) - is this really the correct thing to do?
1634                                                  */
1635                                                 if (rtn != SUCCESS) {
1636                                                         printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1637
1638                                                         SDloop->online = FALSE;
1639                                                         SDloop->host->host_failed--;
1640                                                         scsi_eh_finish_command(&SCdone, SCloop);
1641                                                 }
1642                                         }
1643                                 }
1644                         }
1645                 }
1646         }
1647
1648         if (host->host_failed == 0) {
1649                 ourrtn = TRUE;
1650                 goto leave;
1651         }
1652         /*
1653          * If we ended up here, we have serious problems.  The only thing left
1654          * to try is a full host reset - perhaps the firmware on the device
1655          * crashed, or something like that.
1656          *
1657          * It is assumed that a succesful host reset will cause *all* information
1658          * about the command to be flushed from both the host adapter *and* the
1659          * device.
1660          *
1661          * FIXME(eric) - it isn't clear that devices that implement the soft reset
1662          * option can ever be cleared except via cycling the power.  The problem is
1663          * that sending the host reset command will cause the host to forget
1664          * about the pending command, but the device won't forget.  For now, we
1665          * skip the host reset option if any of the failed devices are configured
1666          * to use the soft reset option.
1667          */
1668         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1669               next_device2:
1670                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1671                         if (SCpnt->state != SCSI_STATE_FAILED
1672                             && SCpnt->state != SCSI_STATE_TIMEOUT) {
1673                                 continue;
1674                         }
1675                         if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1676                                 /*
1677                                  * If this device uses the soft reset option, and this
1678                                  * is one of the devices acting up, then our only
1679                                  * option is to wait a bit, since the command is
1680                                  * supposedly still running.
1681                                  *
1682                                  * FIXME(eric) - right now we will just end up falling
1683                                  * through to the 'take device offline' case.
1684                                  */
1685                                 SCSI_LOG_ERROR_RECOVERY(3,
1686                                                         printk("scsi_unjam_host: Unable to try hard host reset\n"));
1687
1688                                 /*
1689                                  * Due to the spinlock, we will never get out of this
1690                                  * loop without a proper wait. (DB)
1691                                  */
1692                                 scsi_sleep(1 * HZ);
1693
1694                                 goto next_device2;
1695                         }
1696                         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1697
1698                         /*
1699                          * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1700                          */
1701                         rtn = scsi_try_host_reset(SCpnt);
1702                         if (rtn == SUCCESS) {
1703                                 /*
1704                                  * FIXME(eric) we assume that all commands are flushed from the
1705                                  * controller.  We should get a DID_RESET for all of the commands
1706                                  * that were pending.  We should ignore these so that we can
1707                                  * guarantee that we are in a consistent state.
1708                                  *
1709                                  * I believe this to be the case right now, but this needs to be
1710                                  * tested.
1711                                  */
1712                                 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1713                                         for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1714                                                 if (SCloop->state != SCSI_STATE_FAILED
1715                                                     && SCloop->state != SCSI_STATE_TIMEOUT) {
1716                                                         continue;
1717                                                 }
1718                                                 rtn = scsi_test_unit_ready(SCloop);
1719
1720                                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1721                                                         rtn = scsi_eh_retry_command(SCloop);
1722
1723                                                         if (rtn == SUCCESS) {
1724                                                                 SCpnt->host->host_failed--;
1725                                                                 scsi_eh_finish_command(&SCdone, SCloop);
1726                                                         }
1727                                                 }
1728                                                 if (rtn != SUCCESS) {
1729                                                         printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1730                                                         SDloop->online = FALSE;
1731                                                         SDloop->host->host_failed--;
1732                                                         scsi_eh_finish_command(&SCdone, SCloop);
1733                                                 }
1734                                         }
1735                                 }
1736                         }
1737                 }
1738         }
1739
1740         /*
1741          * If we solved all of the problems, then let's rev up the engines again.
1742          */
1743         if (host->host_failed == 0) {
1744                 ourrtn = TRUE;
1745                 goto leave;
1746         }
1747         /*
1748          * If the HOST RESET failed, then for now we assume that the entire host
1749          * adapter is too hosed to be of any use.  For our purposes, however, it is
1750          * easier to simply take the devices offline that correspond to commands
1751          * that failed.
1752          */
1753         SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1754
1755         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1756                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1757                         if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1758                                 SDloop = SCloop->device;
1759                                 if (SDloop->online == TRUE) {
1760                                         printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1761                                         SDloop->online = FALSE;
1762                                 }
1763
1764                                 /*
1765                                  * This should pass the failure up to the top level driver, and
1766                                  * it will have to try and do something intelligent with it.
1767                                  */
1768                                 SCloop->host->host_failed--;
1769
1770                                 if (SCloop->state == SCSI_STATE_TIMEOUT) {
1771                                         SCloop->result |= (DRIVER_TIMEOUT << 24);
1772                                 }
1773                                 SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1774                                     SDloop->id, SCloop->result));
1775
1776                                 scsi_eh_finish_command(&SCdone, SCloop);
1777                         }
1778                 }
1779         }
1780
1781         if (host->host_failed != 0) {
1782                 panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1783         }
1784         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1785
1786         ourrtn = FALSE;
1787
1788       leave:
1789
1790         /*
1791          * We should have a list of commands that we 'finished' during the course of
1792          * error recovery.  This should be the same as the list of commands that timed out
1793          * or failed.  We are currently holding these things in a linked list - we didn't
1794          * put them in the bottom half queue because we wanted to keep things quiet while
1795          * we were working on recovery, and passing them up to the top level could easily
1796          * cause the top level to try and queue something else again.
1797          *
1798          * Start by marking that the host is no longer in error recovery.
1799          */
1800         host->in_recovery = 0;
1801
1802         /*
1803          * Take the list of commands, and stick them in the bottom half queue.
1804          * The current implementation of scsi_done will do this for us - if need
1805          * be we can create a special version of this function to do the
1806          * same job for us.
1807          */
1808         for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1809                 SCdone = SCpnt->bh_next;
1810                 SCpnt->bh_next = NULL;
1811                 /*
1812                  * Oh, this is a vile hack.  scsi_done() expects a timer
1813                  * to be running on the command.  If there isn't, it assumes
1814                  * that the command has actually timed out, and a timer
1815                  * handler is running.  That may well be how we got into
1816                  * this fix, but right now things are stable.  We add
1817                  * a timer back again so that we can report completion.
1818                  * scsi_done() will immediately remove said timer from
1819                  * the command, and then process it.
1820                  */
1821                 scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
1822                 scsi_done(SCpnt);
1823         }
1824
1825         return (ourrtn);
1826 }
1827
1828
1829 /*
1830  * Function:  scsi_error_handler
1831  *
1832  * Purpose:     Handle errors/timeouts of scsi commands, try and clean up
1833  *              and unjam the bus, and restart things.
1834  *
1835  * Arguments:   host    - host for which we are running.
1836  *
1837  * Returns:     Never returns.
1838  *
1839  * Notes:       This is always run in the context of a kernel thread.  The
1840  *              idea is that we start this thing up when the kernel starts
1841  *              up (one per host that we detect), and it immediately goes to
1842  *              sleep and waits for some event (i.e. failure).  When this
1843  *              takes place, we have the job of trying to unjam the bus
1844  *              and restarting things.
1845  *
1846  */
1847 void scsi_error_handler(void *data)
1848 {
1849         struct Scsi_Host *host = (struct Scsi_Host *) data;
1850         int rtn;
1851         DECLARE_MUTEX_LOCKED(sem);
1852
1853         /*
1854          * We only listen to signals if the HA was loaded as a module.
1855          * If the HA was compiled into the kernel, then we don't listen
1856          * to any signals.
1857          */
1858         if( host->loaded_as_module ) {
1859         siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
1860         } else {
1861         siginitsetinv(&current->blocked, 0);
1862         }
1863
1864         lock_kernel();
1865
1866         /*
1867          *    Flush resources
1868          */
1869
1870         daemonize();
1871         reparent_to_init();
1872
1873         /*
1874          * Set the name of this process.
1875          */
1876
1877         sprintf(current->comm, "scsi_eh_%d", host->host_no);
1878
1879         host->eh_wait = &sem;
1880         host->ehandler = current;
1881
1882         unlock_kernel();
1883
1884         /*
1885          * Wake up the thread that created us.
1886          */
1887         SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", sem_getcount(host->eh_notify)));
1888
1889         up(host->eh_notify);
1890
1891         while (1) {
1892                 /*
1893                  * If we get a signal, it means we are supposed to go
1894                  * away and die.  This typically happens if the user is
1895                  * trying to unload a module.
1896                  */
1897                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1898
1899                 /*
1900                  * Note - we always use down_interruptible with the semaphore
1901                  * even if the module was loaded as part of the kernel.  The
1902                  * reason is that down() will cause this thread to be counted
1903                  * in the load average as a running process, and down
1904                  * interruptible doesn't.  Given that we need to allow this
1905                  * thread to die if the driver was loaded as a module, using
1906                  * semaphores isn't unreasonable.
1907                  */
1908                 down_interruptible(&sem);
1909                 if( host->loaded_as_module ) {
1910                         if (signal_pending(current))
1911                                 break;
1912                 }
1913
1914                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1915
1916                 host->eh_active = 1;
1917
1918                 /*
1919                  * We have a host that is failing for some reason.  Figure out
1920                  * what we need to do to get it up and online again (if we can).
1921                  * If we fail, we end up taking the thing offline.
1922                  */
1923                 if (host->hostt->eh_strategy_handler != NULL) {
1924                         rtn = host->hostt->eh_strategy_handler(host);
1925                 } else {
1926                         rtn = scsi_unjam_host(host);
1927                 }
1928
1929                 host->eh_active = 0;
1930
1931                 /*
1932                  * Note - if the above fails completely, the action is to take
1933                  * individual devices offline and flush the queue of any
1934                  * outstanding requests that may have been pending.  When we
1935                  * restart, we restart any I/O to any other devices on the bus
1936                  * which are still online.
1937                  */
1938                 scsi_restart_operations(host);
1939
1940         }
1941
1942         SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1943
1944         /*
1945          * Make sure that nobody tries to wake us up again.
1946          */
1947         host->eh_wait = NULL;
1948
1949         /*
1950          * Knock this down too.  From this point on, the host is flying
1951          * without a pilot.  If this is because the module is being unloaded,
1952          * that's fine.  If the user sent a signal to this thing, we are
1953          * potentially in real danger.
1954          */
1955         host->in_recovery = 0;
1956         host->eh_active = 0;
1957         host->ehandler = NULL;
1958
1959         /*
1960          * If anyone is waiting for us to exit (i.e. someone trying to unload
1961          * a driver), then wake up that process to let them know we are on
1962          * the way out the door.  This may be overkill - I *think* that we
1963          * could probably just unload the driver and send the signal, and when
1964          * the error handling thread wakes up that it would just exit without
1965          * needing to touch any memory associated with the driver itself.
1966          */
1967         if (host->eh_notify != NULL)
1968                 up(host->eh_notify);
1969 }
1970
1971 /*
1972  * Function:    scsi_new_reset
1973  *
1974  * Purpose:     Send requested reset to a bus or device at any phase.
1975  *
1976  * Arguments:   SCpnt   - command ptr to send reset with (usually a dummy)
1977  *              flag - reset type (see scsi.h)
1978  *
1979  * Returns:     SUCCESS/FAILURE.
1980  *
1981  * Notes:       This is used by the SCSI Generic driver to provide
1982  *              Bus/Device reset capability.
1983  */
1984 int
1985 scsi_new_reset(Scsi_Cmnd *SCpnt, int flag)
1986 {
1987         int rtn;
1988
1989         switch(flag) {
1990         case SCSI_TRY_RESET_DEVICE:
1991                 rtn = scsi_try_bus_device_reset(SCpnt, 0);
1992                 if (rtn == SUCCESS)
1993                         break;
1994                 /* FALLTHROUGH */
1995         case SCSI_TRY_RESET_BUS:
1996                 rtn = scsi_try_bus_reset(SCpnt);
1997                 if (rtn == SUCCESS)
1998                         break;
1999                 /* FALLTHROUGH */
2000         case SCSI_TRY_RESET_HOST:
2001                 rtn = scsi_try_host_reset(SCpnt);
2002                 break;
2003         default:
2004                 rtn = FAILED;
2005         }
2006
2007         return rtn;
2008 }
2009
2010 /*
2011  * Overrides for Emacs so that we follow Linus's tabbing style.
2012  * Emacs will notice this stuff at the end of the file and automatically
2013  * adjust the settings for this buffer only.  This must remain at the end
2014  * of the file.
2015  * ---------------------------------------------------------------------------
2016  * Local variables:
2017  * c-indent-level: 4
2018  * c-brace-imaginary-offset: 0
2019  * c-brace-offset: -4
2020  * c-argdecl-indent: 4
2021  * c-label-offset: -4
2022  * c-continued-statement-offset: 4
2023  * c-continued-brace-offset: 0
2024  * indent-tabs-mode: nil
2025  * tab-width: 8
2026  * End:
2027  */