drivers/scsi/scsi_error.c

   1 /*
   2  *  scsi_error.c Copyright (C) 1997 Eric Youngdale
   3  *
   4  *  SCSI error/timeout handling
   5  *      Initial versions: Eric Youngdale.  Based upon conversations with
   6  *                        Leonard Zubkoff and David Miller at Linux Expo,
   7  *                        ideas originating from all over the place.
   8  *
   9  */
  10
  11 #define __NO_VERSION__
  12 #include <linux/module.h>
  13
  14 #include <linux/sched.h>
  15 #include <linux/timer.h>
  16 #include <linux/string.h>
  17 #include <linux/slab.h>
  18 #include <linux/ioport.h>
  19 #include <linux/kernel.h>
  20 #include <linux/stat.h>
  21 #include <linux/blk.h>
  22 #include <linux/interrupt.h>
  23 #include <linux/delay.h>
  24 #include <linux/smp_lock.h>
  25
  26 #define __KERNEL_SYSCALLS__
  27
  28 #include <linux/unistd.h>
  29
  30 #include <asm/system.h>
  31 #include <asm/irq.h>
  32 #include <asm/dma.h>
  33
  34 #include "scsi.h"
  35 #include "hosts.h"
  36 #include "constants.h"
  37
  38 /*
  39  * We must always allow SHUTDOWN_SIGS.  Even if we are not a module,
  40  * the host drivers that we are using may be loaded as modules, and
  41  * when we unload these,  we need to ensure that the error handler thread
  42  * can be shut down.
  43  *
  44  * Note - when we unload a module, we send a SIGHUP.  We mustn't
  45  * enable SIGTERM, as this is how the init shuts things down when you
  46  * go to single-user mode.  For that matter, init also sends SIGKILL,
  47  * so we mustn't enable that one either.  We use SIGHUP instead.  Other
  48  * options would be SIGPWR, I suppose.
  49  */
  50 #define SHUTDOWN_SIGS   (sigmask(SIGHUP))
  51
  52 #ifdef DEBUG
  53 #define SENSE_TIMEOUT SCSI_TIMEOUT
  54 #define ABORT_TIMEOUT SCSI_TIMEOUT
  55 #define RESET_TIMEOUT SCSI_TIMEOUT
  56 #else
  57 #define SENSE_TIMEOUT (10*HZ)
  58 #define RESET_TIMEOUT (2*HZ)
  59 #define ABORT_TIMEOUT (15*HZ)
  60 #endif
  61
  62 #define STATIC
  63
  64 /*
  65  * These should *probably* be handled by the host itself.
  66  * Since it is allowed to sleep, it probably should.
  67  */
  68 #define BUS_RESET_SETTLE_TIME   5*HZ
  69 #define HOST_RESET_SETTLE_TIME  10*HZ
  70
  71
  72 static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
  73
  74 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
  75 STATIC int scsi_request_sense(Scsi_Cmnd *);
  76 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
  77 STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
  78 STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
  79 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
  80 STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
  81 STATIC int scsi_try_host_reset(Scsi_Cmnd *);
  82 STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
  83 STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
  84 STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
  85 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
  86 STATIC void scsi_restart_operations(struct Scsi_Host *);
  87 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
  88
  89
  90 /*
  91  * Function:    scsi_add_timer()
  92  *
  93  * Purpose:     Start timeout timer for a single scsi command.
  94  *
  95  * Arguments:   SCset   - command that is about to start running.
  96  *              timeout - amount of time to allow this command to run.
  97  *              complete - timeout function to call if timer isn't
  98  *                      canceled.
  99  *
 100  * Returns:     Nothing
 101  *
 102  * Notes:       This should be turned into an inline function.
 103  *
 104  * More Notes:  Each scsi command has it's own timer, and as it is added to
 105  *              the queue, we set up the timer.  When the command completes,
 106  *              we cancel the timer.  Pretty simple, really, especially
 107  *              compared to the old way of handling this crap.
 108  */
 109 void scsi_add_timer(Scsi_Cmnd * SCset,
 110                     int timeout,
 111                     void (*complete) (Scsi_Cmnd *))
 112 {
 113         SCset->eh_timeout.data = (unsigned long) SCset;
 114         SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
 115         mod_timer(&SCset->eh_timeout, jiffies + timeout);
 116
 117         SCset->done_late = 0;
 118
 119         SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
 120 }
 121
 122 /*
 123  * Function:    scsi_delete_timer()
 124  *
 125  * Purpose:     Delete/cancel timer for a given function.
 126  *
 127  * Arguments:   SCset   - command that we are canceling timer for.
 128  *
 129  * Returns:     1 if we were able to detach the timer.  0 if we
 130  *              blew it, and the timer function has already started
 131  *              to run.
 132  *
 133  * Notes:       This should be turned into an inline function.
 134  */
 135 int scsi_delete_timer(Scsi_Cmnd * SCset)
 136 {
 137         int rtn;
 138
 139         rtn = del_timer(&SCset->eh_timeout);
 140
 141         SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p %d\n", SCset, rtn));
 142
 143         SCset->eh_timeout.data = (unsigned long) NULL;
 144         SCset->eh_timeout.function = NULL;
 145
 146         return rtn;
 147 }
 148
 149 /*
 150  * Function:    scsi_times_out()
 151  *
 152  * Purpose:     Timeout function for normal scsi commands..
 153  *
 154  * Arguments:   SCpnt   - command that is timing out.
 155  *
 156  * Returns:     Nothing.
 157  *
 158  * Notes:       We do not need to lock this.  There is the potential for
 159  *              a race only in that the normal completion handling might
 160  *              run, but if the normal completion function determines
 161  *              that the timer has already fired, then it mustn't do
 162  *              anything.
 163  */
 164 void scsi_times_out(Scsi_Cmnd * SCpnt)
 165 {
 166         /*
 167          * Notify the low-level code that this operation failed and we are
 168          * reposessing the command.
 169          */
 170 #ifdef ERIC_neverdef
 171         /*
 172          * FIXME(eric)
 173          * Allow the host adapter to push a queue ordering tag
 174          * out to the bus to force the command in question to complete.
 175          * If the host wants to do this, then we just restart the timer
 176          * for the command.  Before we really do this, some real thought
 177          * as to the optimum way to handle this should be done.  We *do*
 178          * need to force ordering every so often to ensure that all requests
 179          * do eventually complete, but I am not sure if this is the best way
 180          * to actually go about it.
 181          *
 182          * Better yet, force a sync here, but don't block since we are in an
 183          * interrupt.
 184          */
 185         if (SCpnt->host->hostt->eh_ordered_queue_tag) {
 186                 if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
 187                         scsi_add_timer(SCpnt, SCpnt->internal_timeout,
 188                                        scsi_times_out);
 189                         return;
 190                 }
 191         }
 192         /*
 193          * FIXME(eric) - add a second special interface to handle this
 194          * case.  Ideally that interface can also be used to request
 195          * a queu
 196          */
 197         if (SCpnt->host->can_queue) {
 198                 SCpnt->host->hostt->queuecommand(SCpnt, NULL);
 199         }
 200 #endif
 201
 202         /* Set the serial_number_at_timeout to the current serial_number */
 203         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 204
 205         SCpnt->eh_state = FAILED;
 206         SCpnt->state = SCSI_STATE_TIMEOUT;
 207         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 208
 209         SCpnt->host->in_recovery = 1;
 210         SCpnt->host->host_failed++;
 211
 212         SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
 213                                    atomic_read(&SCpnt->host->host_active),
 214                                    SCpnt->host->host_busy,
 215                                    SCpnt->host->host_failed));
 216
 217         /*
 218          * If the host is having troubles, then look to see if this was the last
 219          * command that might have failed.  If so, wake up the error handler.
 220          */
 221         if( SCpnt->host->eh_wait == NULL ) {
 222                 panic("Error handler thread not present at %p %p %s %d",
 223                       SCpnt, SCpnt->host, __FILE__, __LINE__);
 224         }
 225         if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
 226                 up(SCpnt->host->eh_wait);
 227         }
 228 }
 229
 230 /*
 231  * Function     scsi_block_when_processing_errors
 232  *
 233  * Purpose:     Prevent more commands from being queued while error recovery
 234  *              is taking place.
 235  *
 236  * Arguments:   SDpnt - device on which we are performing recovery.
 237  *
 238  * Returns:     FALSE   The device was taken offline by error recovery.
 239  *              TRUE    OK to proceed.
 240  *
 241  * Notes:       We block until the host is out of error recovery, and then
 242  *              check to see whether the host or the device is offline.
 243  */
 244 int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
 245 {
 246
 247         SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);
 248
 249         SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));
 250
 251         return SDpnt->online;
 252 }
 253
 254 /*
 255  * Function:    scsi_eh_times_out()
 256  *
 257  * Purpose:     Timeout function for error handling.
 258  *
 259  * Arguments:   SCpnt   - command that is timing out.
 260  *
 261  * Returns:     Nothing.
 262  *
 263  * Notes:       During error handling, the kernel thread will be sleeping
 264  *              waiting for some action to complete on the device.  Our only
 265  *              job is to record that it timed out, and to wake up the
 266  *              thread.
 267  */
 268 STATIC
 269 void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
 270 {
 271         SCpnt->eh_state = SCSI_STATE_TIMEOUT;
 272         SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
 273
 274         if (SCpnt->host->eh_action != NULL)
 275                 up(SCpnt->host->eh_action);
 276         else
 277                 printk("Missing scsi error handler thread\n");
 278 }
 279
 280
 281 /*
 282  * Function:    scsi_eh_done()
 283  *
 284  * Purpose:     Completion function for error handling.
 285  *
 286  * Arguments:   SCpnt   - command that is timing out.
 287  *
 288  * Returns:     Nothing.
 289  *
 290  * Notes:       During error handling, the kernel thread will be sleeping
 291  *              waiting for some action to complete on the device.  Our only
 292  *              job is to record that the action completed, and to wake up the
 293  *              thread.
 294  */
 295 STATIC
 296 void scsi_eh_done(Scsi_Cmnd * SCpnt)
 297 {
 298         int     rtn;
 299
 300         /*
 301          * If the timeout handler is already running, then just set the
 302          * flag which says we finished late, and return.  We have no
 303          * way of stopping the timeout handler from running, so we must
 304          * always defer to it.
 305          */
 306         rtn = del_timer(&SCpnt->eh_timeout);
 307         if (!rtn) {
 308                 SCpnt->done_late = 1;
 309                 return;
 310         }
 311
 312         SCpnt->request.rq_status = RQ_SCSI_DONE;
 313
 314         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 315         SCpnt->eh_state = SUCCESS;
 316
 317         SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
 318                                           SCpnt->result));
 319
 320         if (SCpnt->host->eh_action != NULL)
 321                 up(SCpnt->host->eh_action);
 322 }
 323
 324 /*
 325  * Function:    scsi_eh_action_done()
 326  *
 327  * Purpose:     Completion function for error handling.
 328  *
 329  * Arguments:   SCpnt   - command that is timing out.
 330  *              answer  - boolean that indicates whether operation succeeded.
 331  *
 332  * Returns:     Nothing.
 333  *
 334  * Notes:       This callback is only used for abort and reset operations.
 335  */
 336 STATIC
 337 void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
 338 {
 339         SCpnt->request.rq_status = RQ_SCSI_DONE;
 340
 341         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 342         SCpnt->eh_state = (answer ? SUCCESS : FAILED);
 343
 344         if (SCpnt->host->eh_action != NULL)
 345                 up(SCpnt->host->eh_action);
 346 }
 347
 348 /*
 349  * Function:  scsi_sense_valid()
 350  *
 351  * Purpose:     Determine whether a host has automatically obtained sense
 352  *              information or not.  If we have it, then give a recommendation
 353  *              as to what we should do next.
 354  */
 355 int scsi_sense_valid(Scsi_Cmnd * SCpnt)
 356 {
 357         if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
 358                 return FALSE;
 359         }
 360         return TRUE;
 361 }
 362
 363 /*
 364  * Function:  scsi_eh_retry_command()
 365  *
 366  * Purpose:     Retry the original command
 367  *
 368  * Returns:     SUCCESS - we were able to get the sense data.
 369  *              FAILED  - we were not able to get the sense data.
 370  *
 371  * Notes:       This function will *NOT* return until the command either
 372  *              times out, or it completes.
 373  */
 374 STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
 375 {
 376         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 377                sizeof(SCpnt->data_cmnd));
 378         SCpnt->request_buffer = SCpnt->buffer;
 379         SCpnt->request_bufflen = SCpnt->bufflen;
 380         SCpnt->use_sg = SCpnt->old_use_sg;
 381         SCpnt->cmd_len = SCpnt->old_cmd_len;
 382         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 383         SCpnt->underflow = SCpnt->old_underflow;
 384
 385         scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
 386
 387         /*
 388          * Hey, we are done.  Let's look to see what happened.
 389          */
 390         return SCpnt->eh_state;
 391 }
 392
 393 /*
 394  * Function:  scsi_request_sense()
 395  *
 396  * Purpose:     Request sense data from a particular target.
 397  *
 398  * Returns:     SUCCESS - we were able to get the sense data.
 399  *              FAILED  - we were not able to get the sense data.
 400  *
 401  * Notes:       Some hosts automatically obtain this information, others
 402  *              require that we obtain it on our own.
 403  *
 404  *              This function will *NOT* return until the command either
 405  *              times out, or it completes.
 406  */
 407 STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
 408 {
 409         static unsigned char generic_sense[6] =
 410         {REQUEST_SENSE, 0, 0, 0, 255, 0};
 411         unsigned char scsi_result0[256], *scsi_result = NULL;
 412         int saved_result;
 413         int saved_resid;
 414
 415         ASSERT_LOCK(&io_request_lock, 0);
 416
 417         memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
 418                sizeof(generic_sense));
 419
 420         if (SCpnt->device->scsi_level <= SCSI_2)
 421                 SCpnt->cmnd[1] = SCpnt->lun << 5;
 422
 423         scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
 424             ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
 425
 426         if (scsi_result == NULL) {
 427                 printk("cannot allocate scsi_result in scsi_request_sense.\n");
 428                 return FAILED;
 429         }
 430         /*
 431          * Zero the sense buffer.  Some host adapters automatically always request
 432          * sense, so it is not a good idea that SCpnt->request_buffer and
 433          * SCpnt->sense_buffer point to the same address (DB).
 434          * 0 is not a valid sense code.
 435          */
 436         memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
 437         memset((void *) scsi_result, 0, 256);
 438
 439         saved_result = SCpnt->result;
 440         saved_resid = SCpnt->resid;
 441         SCpnt->request_buffer = scsi_result;
 442         SCpnt->request_bufflen = 256;
 443         SCpnt->use_sg = 0;
 444         SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
 445         SCpnt->sc_data_direction = SCSI_DATA_READ;
 446         SCpnt->underflow = 0;
 447
 448         scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
 449
 450         /* Last chance to have valid sense data */
 451         if (!scsi_sense_valid(SCpnt))
 452                 memcpy((void *) SCpnt->sense_buffer,
 453                        SCpnt->request_buffer,
 454                        sizeof(SCpnt->sense_buffer));
 455
 456         if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
 457                 kfree(scsi_result);
 458
 459         /*
 460          * When we eventually call scsi_finish, we really wish to complete
 461          * the original request, so let's restore the original data. (DB)
 462          */
 463         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 464                sizeof(SCpnt->data_cmnd));
 465         SCpnt->result = saved_result;
 466         SCpnt->resid = saved_resid;
 467         SCpnt->request_buffer = SCpnt->buffer;
 468         SCpnt->request_bufflen = SCpnt->bufflen;
 469         SCpnt->use_sg = SCpnt->old_use_sg;
 470         SCpnt->cmd_len = SCpnt->old_cmd_len;
 471         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 472         SCpnt->underflow = SCpnt->old_underflow;
 473
 474         /*
 475          * Hey, we are done.  Let's look to see what happened.
 476          */
 477         return SCpnt->eh_state;
 478 }
 479
 480 /*
 481  * Function:  scsi_test_unit_ready()
 482  *
 483  * Purpose:     Run test unit ready command to see if the device is talking to us or not.
 484  *
 485  */
 486 STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
 487 {
 488         static unsigned char tur_command[6] =
 489         {TEST_UNIT_READY, 0, 0, 0, 0, 0};
 490         int saved_resid;
 491
 492         memcpy((void *) SCpnt->cmnd, (void *) tur_command,
 493                sizeof(tur_command));
 494
 495         if (SCpnt->device->scsi_level <= SCSI_2)
 496                 SCpnt->cmnd[1] = SCpnt->lun << 5;
 497
 498         /*
 499          * Zero the sense buffer.  The SCSI spec mandates that any
 500          * untransferred sense data should be interpreted as being zero.
 501          */
 502         memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
 503
 504         saved_resid = SCpnt->resid;
 505         SCpnt->request_buffer = NULL;
 506         SCpnt->request_bufflen = 0;
 507         SCpnt->use_sg = 0;
 508         SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
 509         SCpnt->underflow = 0;
 510         SCpnt->sc_data_direction = SCSI_DATA_NONE;
 511
 512         scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
 513
 514         /*
 515          * When we eventually call scsi_finish, we really wish to complete
 516          * the original request, so let's restore the original data. (DB)
 517          */
 518         memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
 519                sizeof(SCpnt->data_cmnd));
 520         SCpnt->resid = saved_resid;
 521         SCpnt->request_buffer = SCpnt->buffer;
 522         SCpnt->request_bufflen = SCpnt->bufflen;
 523         SCpnt->use_sg = SCpnt->old_use_sg;
 524         SCpnt->cmd_len = SCpnt->old_cmd_len;
 525         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 526         SCpnt->underflow = SCpnt->old_underflow;
 527
 528         /*
 529          * Hey, we are done.  Let's look to see what happened.
 530          */
 531         SCSI_LOG_ERROR_RECOVERY(3,
 532                 printk("scsi_test_unit_ready: SCpnt %p eh_state %x\n",
 533                 SCpnt, SCpnt->eh_state));
 534         return SCpnt->eh_state;
 535 }
 536
 537 /*
 538  * This would normally need to get the IO request lock,
 539  * but as it doesn't actually touch anything that needs
 540  * to be locked we can avoid the lock here..
 541  */
 542 STATIC
 543 void scsi_sleep_done(struct semaphore *sem)
 544 {
 545         if (sem != NULL) {
 546                 up(sem);
 547         }
 548 }
 549
 550 void scsi_sleep(int timeout)
 551 {
 552         DECLARE_MUTEX_LOCKED(sem);
 553         struct timer_list timer;
 554
 555         init_timer(&timer);
 556         timer.data = (unsigned long) &sem;
 557         timer.expires = jiffies + timeout;
 558         timer.function = (void (*)(unsigned long)) scsi_sleep_done;
 559
 560         SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
 561
 562         add_timer(&timer);
 563
 564         down(&sem);
 565         del_timer(&timer);
 566 }
 567
 568 /*
 569  * Function:  scsi_send_eh_cmnd
 570  *
 571  * Purpose:     Send a command out to a device as part of error recovery.
 572  *
 573  * Notes:       The initialization of the structures is quite a bit different
 574  *              in this case, and furthermore, there is a different completion
 575  *              handler.
 576  */
 577 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
 578 {
 579         unsigned long flags;
 580         struct Scsi_Host *host;
 581
 582         ASSERT_LOCK(&io_request_lock, 0);
 583
 584         host = SCpnt->host;
 585
 586       retry:
 587         /*
 588          * We will use a queued command if possible, otherwise we will emulate the
 589          * queuing and calling of completion function ourselves.
 590          */
 591         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 592
 593         if (host->can_queue) {
 594                 DECLARE_MUTEX_LOCKED(sem);
 595
 596                 SCpnt->eh_state = SCSI_STATE_QUEUED;
 597
 598                 scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
 599
 600                 /*
 601                  * Set up the semaphore so we wait for the command to complete.
 602                  */
 603                 SCpnt->host->eh_action = &sem;
 604                 SCpnt->request.rq_status = RQ_SCSI_BUSY;
 605
 606                 spin_lock_irqsave(&io_request_lock, flags);
 607                 host->hostt->queuecommand(SCpnt, scsi_eh_done);
 608                 spin_unlock_irqrestore(&io_request_lock, flags);
 609
 610                 down(&sem);
 611
 612                 SCpnt->host->eh_action = NULL;
 613
 614                 /*
 615                  * See if timeout.  If so, tell the host to forget about it.
 616                  * In other words, we don't want a callback any more.
 617                  */
 618                 if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
 619                         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 620
 621                         /*
 622                          * As far as the low level driver is
 623                          * concerned, this command is still active, so
 624                          * we must give the low level driver a chance
 625                          * to abort it. (DB)
 626                          *
 627                          * FIXME(eric) - we are not tracking whether we could
 628                          * abort a timed out command or not.  Not sure how
 629                          * we should treat them differently anyways.
 630                          */
 631                         spin_lock_irqsave(&io_request_lock, flags);
 632                         if (SCpnt->host->hostt->eh_abort_handler)
 633                                 SCpnt->host->hostt->eh_abort_handler(SCpnt);
 634                         spin_unlock_irqrestore(&io_request_lock, flags);
 635
 636                         SCpnt->request.rq_status = RQ_SCSI_DONE;
 637                         SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
 638
 639                         SCpnt->eh_state = FAILED;
 640                 }
 641                 SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
 642                                                 SCpnt, SCpnt->eh_state));
 643         } else {
 644                 int temp;
 645
 646                 /*
 647                  * We damn well had better never use this code.  There is no timeout
 648                  * protection here, since we would end up waiting in the actual low
 649                  * level driver, we don't know how to wake it up.
 650                  */
 651                 spin_lock_irqsave(&io_request_lock, flags);
 652                 temp = host->hostt->command(SCpnt);
 653                 spin_unlock_irqrestore(&io_request_lock, flags);
 654
 655                 SCpnt->result = temp;
 656                 /* Fall through to code below to examine status. */
 657                 SCpnt->eh_state = SUCCESS;
 658         }
 659
 660         /*
 661          * Now examine the actual status codes to see whether the command actually
 662          * did complete normally.
 663          */
 664         if (SCpnt->eh_state == SUCCESS) {
 665                 int ret = scsi_eh_completed_normally(SCpnt);
 666                 SCSI_LOG_ERROR_RECOVERY(3,
 667                         printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %x\n", ret));
 668                 switch (ret) {
 669                 case SUCCESS:
 670                         SCpnt->eh_state = SUCCESS;
 671                         break;
 672                 case NEEDS_RETRY:
 673                         if ((++SCpnt->retries) < SCpnt->allowed)
 674                                 goto retry;
 675                         SCpnt->eh_state = SUCCESS;
 676                         break;
 677                 case FAILED:
 678                 default:
 679                         SCpnt->eh_state = FAILED;
 680                         break;
 681                 }
 682         } else {
 683                 SCpnt->eh_state = FAILED;
 684         }
 685 }
 686
 687 /*
 688  * Function:  scsi_unit_is_ready()
 689  *
 690  * Purpose:     Called after TEST_UNIT_READY is run, to test to see if
 691  *              the unit responded in a way that indicates it is ready.
 692  */
 693 STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
 694 {
 695         if (SCpnt->result) {
 696                 if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
 697                      (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
 698                     ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
 699                         if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
 700                             ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
 701                             ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
 702                                 return 0;
 703                         }
 704                 }
 705         }
 706         return 1;
 707 }
 708
 709 /*
 710  * Function:    scsi_eh_finish_command
 711  *
 712  * Purpose:     Handle a command that we are finished with WRT error handling.
 713  *
 714  * Arguments:   SClist - pointer to list into which we are putting completed commands.
 715  *              SCpnt  - command that is completing
 716  *
 717  * Notes:       We don't want to use the normal command completion while we are
 718  *              are still handling errors - it may cause other commands to be queued,
 719  *              and that would disturb what we are doing.  Thus we really want to keep
 720  *              a list of pending commands for final completion, and once we
 721  *              are ready to leave error handling we handle completion for real.
 722  */
 723 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
 724 {
 725         SCpnt->state = SCSI_STATE_BHQUEUE;
 726         SCpnt->bh_next = *SClist;
 727         /*
 728          * Set this back so that the upper level can correctly free up
 729          * things.
 730          */
 731         SCpnt->use_sg = SCpnt->old_use_sg;
 732         SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
 733         SCpnt->underflow = SCpnt->old_underflow;
 734         *SClist = SCpnt;
 735 }
 736
 737 /*
 738  * Function:  scsi_try_to_abort_command
 739  *
 740  * Purpose:     Ask host adapter to abort a running command.
 741  *
 742  * Returns:     FAILED          Operation failed or not supported.
 743  *              SUCCESS         Succeeded.
 744  *
 745  * Notes:       This function will not return until the user's completion
 746  *              function has been called.  There is no timeout on this
 747  *              operation.  If the author of the low-level driver wishes
 748  *              this operation to be timed, they can provide this facility
 749  *              themselves.  Helper functions in scsi_error.c can be supplied
 750  *              to make this easier to do.
 751  *
 752  * Notes:       It may be possible to combine this with all of the reset
 753  *              handling to eliminate a lot of code duplication.  I don't
 754  *              know what makes more sense at the moment - this is just a
 755  *              prototype.
 756  */
 757 STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
 758 {
 759         int rtn;
 760         unsigned long flags;
 761
 762         SCpnt->eh_state = FAILED;       /* Until we come up with something better */
 763
 764         if (SCpnt->host->hostt->eh_abort_handler == NULL) {
 765                 return FAILED;
 766         }
 767         /*
 768          * scsi_done was called just after the command timed out and before
 769          * we had a chance to process it. (DB)
 770          */
 771         if (SCpnt->serial_number == 0)
 772                 return SUCCESS;
 773
 774         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 775
 776         spin_lock_irqsave(&io_request_lock, flags);
 777         rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
 778         spin_unlock_irqrestore(&io_request_lock, flags);
 779         return rtn;
 780 }
 781
 782 /*
 783  * Function:  scsi_try_bus_device_reset
 784  *
 785  * Purpose:     Ask host adapter to perform a bus device reset for a given
 786  *              device.
 787  *
 788  * Returns:     FAILED          Operation failed or not supported.
 789  *              SUCCESS         Succeeded.
 790  *
 791  * Notes:       There is no timeout for this operation.  If this operation is
 792  *              unreliable for a given host, then the host itself needs to put a
 793  *              timer on it, and set the host back to a consistent state prior
 794  *              to returning.
 795  */
 796 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
 797 {
 798         unsigned long flags;
 799         int rtn;
 800
 801         SCpnt->eh_state = FAILED;       /* Until we come up with something better */
 802
 803         if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
 804                 return FAILED;
 805         }
 806         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 807
 808         spin_lock_irqsave(&io_request_lock, flags);
 809         rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
 810         spin_unlock_irqrestore(&io_request_lock, flags);
 811
 812         if (rtn == SUCCESS)
 813                 SCpnt->eh_state = SUCCESS;
 814
 815         return SCpnt->eh_state;
 816 }
 817
 818 /*
 819  * Function:  scsi_try_bus_reset
 820  *
 821  * Purpose:     Have the host adapter reset the bus that SCpnt was sent on.
 822  *
 823  * Returns:     SUCCESS         The handler reported SUCCESS.
 824  *              FAILED          No handler provided, or it did not succeed.
 825  *
 826  * Notes:       scsi_sleep(BUS_RESET_SETTLE_TIME) follows every handler call.
 827  */
 828 STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
 829 {
 830         unsigned long irq_flags;
 831         int handler_rc;
 832         Scsi_Device *dev;
 833
 834         SCpnt->eh_state = FAILED;
 835         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 836         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 837
 838         if (!SCpnt->host->hostt->eh_bus_reset_handler)
 839                 return FAILED;
 840
 841         spin_lock_irqsave(&io_request_lock, irq_flags);
 842         handler_rc = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
 843         spin_unlock_irqrestore(&io_request_lock, irq_flags);
 844
 845         if (handler_rc == SUCCESS)
 846                 SCpnt->eh_state = SUCCESS;
 847
 848         scsi_sleep(BUS_RESET_SETTLE_TIME);
 849
 850         /*
 851          * After a successful reset, flag every device on the same channel
 852          * as reset and expecting a unit attention condition.
 853          */
 854         if (SCpnt->eh_state == SUCCESS) {
 855                 for (dev = SCpnt->host->host_queue; dev; dev = dev->next) {
 856                         if (dev->channel != SCpnt->channel)
 857                                 continue;
 858                         dev->was_reset = 1;
 859                         dev->expecting_cc_ua = 1;
 860                 }
 861         }
 862         return SCpnt->eh_state;
 863 }
 864
 865 /*
 866  * Function:  scsi_try_host_reset
 867  *
 868  * Purpose:     Have the host adapter reset itself together with its bus.
 869  *
 870  * Returns:     SUCCESS         The handler reported SUCCESS.
 871  *              FAILED          No handler provided, or it did not succeed.
 872  *
 873  * Notes:       scsi_sleep(HOST_RESET_SETTLE_TIME) follows every handler call.
 874  */
 875 STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
 876 {
 877         unsigned long irq_flags;
 878         int handler_rc;
 879         Scsi_Device *dev;
 880
 881         SCpnt->eh_state = FAILED;
 882         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
 883         SCpnt->serial_number_at_timeout = SCpnt->serial_number;
 884
 885         if (!SCpnt->host->hostt->eh_host_reset_handler)
 886                 return FAILED;
 887
 888         spin_lock_irqsave(&io_request_lock, irq_flags);
 889         handler_rc = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
 890         spin_unlock_irqrestore(&io_request_lock, irq_flags);
 891
 892         if (handler_rc == SUCCESS)
 893                 SCpnt->eh_state = SUCCESS;
 894         scsi_sleep(HOST_RESET_SETTLE_TIME);
 895
 896         /*
 897          * After a successful reset every device on this host is flagged
 898          * as reset and expecting a unit attention condition.
 899          */
 900         if (SCpnt->eh_state == SUCCESS) {
 901                 for (dev = SCpnt->host->host_queue; dev; dev = dev->next) {
 902                         dev->was_reset = 1;
 903                         dev->expecting_cc_ua = 1;
 904                 }
 905         }
 906         return SCpnt->eh_state;
 907 }
 908
 909 /*
 910  * Function:  scsi_decide_disposition
 911  *
 912  * Purpose:     Classify a command the low-level driver has handed back,
 913  *              by inspecting the host, message and status bytes of
 914  *              SCpnt->result in that order.
 915  *
 916  * Returns:     SUCCESS         - hand the command to the upper level
 917  *                                (this does not imply the I/O worked).
 918  *              FAILED          - leave it to the error handler thread.
 919  *              NEEDS_RETRY     - reissue the command.
 920  *              ADD_TO_MLQUEUE  - device reported QUEUE_FULL.
 921  *
 922  * Notes:       Only the completion of a real data command comes through
 923  *              here.  Commands issued on behalf of error recovery
 924  *              (e.g. TEST_UNIT_READY) are *NOT* examined by this routine.
 925  *
 926  *              Returning FAILED wakes the error handler thread, so an
 927  *              error that needs no abort/reset must be reported as
 928  *              SUCCESS instead.
 929  *
 930  *              Retryable conditions bump SCpnt->retries; once it reaches
 931  *              SCpnt->allowed the command is reported as SUCCESS.
 932  */
 933 int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
 934 {
 935         int sense_verdict;
 936
 937         /*
 938          * Nothing can be done for an offline device: report the result
 939          * upward untouched.
 940          */
 941         if (SCpnt->device->online == FALSE) {
 942                 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
 943                 return SUCCESS;
 944         }
 945
 946         /*
 947          * Stage 1: the host byte.
 948          */
 949         switch (host_byte(SCpnt->result)) {
 950         case DID_OK:
 951                 /*
 952                  * Nothing wrong at this level; continue with the message
 953                  * and status bytes.
 954                  */
 955                 break;
 956
 957         case DID_PASSTHROUGH:
 958                 /*
 959                  * Always goes to the upper layer.  The host byte is
 960                  * cleared first so that the result reads as DID_OK.
 961                  */
 962                 SCpnt->result &= 0xff00ffff;
 963                 return SUCCESS;
 964
 965         case DID_NO_CONNECT:
 966         case DID_BAD_TARGET:
 967         case DID_ABORT:
 968                 /*
 969                  * SUCCESS here only means "let the top level driver see
 970                  * this status", not that the command worked.
 971                  */
 972                 return SUCCESS;
 973
 974         case DID_ERROR:
 975                 /*
 976                  * A reservation conflict is dealt with by the status byte
 977                  * switch further down; any other DID_ERROR is retried.
 978                  */
 979                 if (msg_byte(SCpnt->result) != COMMAND_COMPLETE ||
 980                     status_byte(SCpnt->result) != RESERVATION_CONFLICT)
 981                         goto maybe_retry;
 982                 break;
 983
 984         case DID_SOFT_ERROR:
 985         case DID_BUS_BUSY:
 986         case DID_PARITY:
 987                 /*
 988                  * Retried up to SCpnt->allowed attempts (see maybe_retry).
 989                  * An earlier note (DB) states that a low level driver
 990                  * returning DID_SOFT_ERROR is responsible for keeping its
 991                  * own internal retry counter as well.
 992                  */
 993                 goto maybe_retry;
 994
 995         case DID_TIME_OUT:
 996                 /*
 997                  * Bus scans produce timeouts for these two commands when
 998                  * no device is present; other hosts report DID_NO_CONNECT
 999                  * for the same situation.
1000                  */
1001                 if (SCpnt->cmnd[0] == TEST_UNIT_READY)
1002                         return SUCCESS;
1003                 if (SCpnt->cmnd[0] == INQUIRY)
1004                         return SUCCESS;
1005                 return FAILED;
1006
1007         case DID_RESET:
1008                 /*
1009                  * A reset we started ourselves (IS_RESETTING set) leads to
1010                  * a retry; any other reset is passed to the upper level.
1011                  */
1012                 if (!(SCpnt->flags & IS_RESETTING))
1013                         return SUCCESS;
1014                 SCpnt->flags &= ~IS_RESETTING;
1015                 goto maybe_retry;
1016
1017         default:
1018                 return FAILED;
1019         }
1020
1021         /*
1022          * Stage 2: the message byte.
1023          */
1024         if (msg_byte(SCpnt->result) != COMMAND_COMPLETE)
1025                 return FAILED;
1026
1027         /*
1028          * Stage 3: the status byte.
1029          */
1030         switch (status_byte(SCpnt->result)) {
1031         case GOOD:
1032         case COMMAND_TERMINATED:
1033                 return SUCCESS;
1034
1035         case CONDITION_GOOD:
1036         case INTERMEDIATE_GOOD:
1037         case INTERMEDIATE_C_GOOD:
1038                 /*
1039                  * Who knows?  FIXME(eric)
1040                  */
1041                 return SUCCESS;
1042
1043         case QUEUE_FULL:
1044                 /*
1045                  * A tagged queueing device was sent more commands than it
1046                  * can hold.
1047                  */
1048                 return ADD_TO_MLQUEUE;
1049
1050         case CHECK_CONDITION:
1051                 sense_verdict = scsi_check_sense(SCpnt);
1052                 if (sense_verdict != NEEDS_RETRY)
1053                         return sense_verdict;
1054                 goto maybe_retry;
1055
1056         case BUSY:
1057                 goto maybe_retry;
1058
1059         case RESERVATION_CONFLICT:
1060                 printk("scsi%d (%d,%d,%d) : RESERVATION CONFLICT\n",
1061                        SCpnt->host->host_no, SCpnt->channel,
1062                        SCpnt->device->id, SCpnt->device->lun);
1063                 return SUCCESS; /* causes immediate I/O error */
1064
1065         default:
1066                 break;
1067         }
1068         return FAILED;
1069
1070       maybe_retry:
1071         /*
1072          * Count this attempt.  While attempts remain ask for a retry;
1073          * afterwards give up and report the command to the upper level.
1074          */
1075         if (++SCpnt->retries < SCpnt->allowed)
1076                 return NEEDS_RETRY;
1077         return SUCCESS;
1078 }
1079
1080 /*
1081  * Function:  scsi_eh_completed_normally
1082  *
1083  * Purpose:     Judge the outcome of a command that was issued as part of
1084  *              error recovery.
1085  *
1086  * Returns:     SUCCESS         - the command completed acceptably.
1087  *              FAILED          - it did not.
1088  *              NEEDS_RETRY     - it was hit by a reset we initiated.
1089  *              Anything scsi_check_sense() returns may also come back.
1090  *
1091  * Notes:       Counterpart of scsi_decide_disposition() for commands
1092  *              queued during error recovery.  It is stricter: there is
1093  *              no retry accounting, and BUSY, QUEUE_FULL and
1094  *              RESERVATION_CONFLICT all count as failures.
1095  */
1096 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1097 {
1098         /*
1099          * Host byte first.  DID_RESET gets special treatment, DID_OK lets
1100          * us carry on, and every other value is a failure.
1101          */
1102         switch (host_byte(SCpnt->result)) {
1103         case DID_OK:
1104                 break;
1105
1106         case DID_RESET:
1107                 if (SCpnt->flags & IS_RESETTING) {
1108                         /*
1109                          * This is the reset we asked for.  Whether the
1110                          * command really has to be rerun is not known
1111                          * here: for the original data command it does,
1112                          * otherwise it is simply flagged as success.
1113                          */
1114                         SCpnt->flags &= ~IS_RESETTING;
1115                         return NEEDS_RETRY;
1116                 }
1117                 /*
1118                  * An unexpected reset while already in the error handler.
1119                  * Valid sense data tells us what to do; without it
1120                  * scsi_check_sense() reports FAILED.
1121                  */
1122                 return scsi_check_sense(SCpnt);
1123
1124         default:
1125                 return FAILED;
1126         }
1127
1128         /* Then the message byte. */
1129         if (msg_byte(SCpnt->result) != COMMAND_COMPLETE)
1130                 return FAILED;
1131
1132         /* Finally the status byte. */
1133         switch (status_byte(SCpnt->result)) {
1134         case CHECK_CONDITION:
1135                 return scsi_check_sense(SCpnt);
1136
1137         case GOOD:
1138         case COMMAND_TERMINATED:
1139                 return SUCCESS;
1140
1141         case CONDITION_GOOD:
1142         case INTERMEDIATE_GOOD:
1143         case INTERMEDIATE_C_GOOD:
1144                 /* Who knows?  FIXME(eric) */
1145                 return SUCCESS;
1146
1147         case BUSY:
1148         case QUEUE_FULL:
1149         case RESERVATION_CONFLICT:
1150         default:
1151                 break;
1152         }
1153         return FAILED;
1154 }
1155
1156 /*
1157  * Function:  scsi_check_sense
1158  *
1159  * Purpose:     Turn the sense data attached to a command into a
1160  *              recommendation for the caller.
1161  *
1162  * Returns:     FAILED          - no valid sense data is available.
1163  *              NEEDS_RETRY     - the command is worth reissuing.
1164  *              SUCCESS         - report the command as it stands.
1165  */
1166 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1167 {
1168         if (!scsi_sense_valid(SCpnt))
1169                 return FAILED;
1170
1171         /*
1172          * Any of the top three bits of sense byte 2 being set means the
1173          * result goes straight up, whatever the sense key says.
1174          */
1175         if (SCpnt->sense_buffer[2] & 0xe0)
1176                 return SUCCESS;
1177
1178         switch (SCpnt->sense_buffer[2] & 0xf) {
1179         case ABORTED_COMMAND:
1180         case MEDIUM_ERROR:
1181                 return NEEDS_RETRY;
1182
1183         case NOT_READY:
1184         case UNIT_ATTENTION:
1185                 /*
1186                  * A CC/UA that we are expecting because of a reset we
1187                  * performed ourselves is consumed here and turned into
1188                  * a retry.
1189                  */
1190                 if (SCpnt->device->expecting_cc_ua) {
1191                         SCpnt->device->expecting_cc_ua = 0;
1192                         return NEEDS_RETRY;
1193                 }
1194                 /*
1195                  * Sense bytes 12/13 of 0x04/0x01: the device is in the
1196                  * process of becoming ready, so retry.
1197                  */
1198                 if (SCpnt->sense_buffer[12] == 0x04 &&
1199                     SCpnt->sense_buffer[13] == 0x01)
1200                         return NEEDS_RETRY;
1201                 /* Otherwise the upper-level driver gets to handle it. */
1202                 return SUCCESS;
1203
1204         case NO_SENSE:
1205         case RECOVERED_ERROR:   /* would be SOFT_ERROR */
1206         case COPY_ABORTED:      /* not supported */
1207         case VOLUME_OVERFLOW:   /* not supported */
1208         case MISCOMPARE:        /* not supported */
1209         case ILLEGAL_REQUEST:
1210         case BLANK_CHECK:
1211         case DATA_PROTECT:
1212         case HARDWARE_ERROR:
1213         default:
1214                 return SUCCESS;
1215         }
1216 }
1217
1218
1219 /*
1220  * Function:  scsi_restart_operations
1221  *
1222  * Purpose:     Get I/O flowing again on a host once error recovery is over.
1223  *
1224  * Arguments:   host  - the host being restarted.
1225  *
1226  * Lock status: Assumed that io_request_lock is not held on entry; it is
1227  *              taken and released here.
1228  *
1229  * Returns:     Nothing
1230  *
1231  * Notes:       Undoes the blocking of I/O that was put in place when the
1232  *              error handler was entered.
1233  */
1234 STATIC void scsi_restart_operations(struct Scsi_Host *host)
1235 {
1236         Scsi_Device *dev;
1237         request_queue_t *queue;
1238         unsigned long irq_flags;
1239
1240         ASSERT_LOCK(&io_request_lock, 0);
1241
1242         /*
1243          * Release whoever sleeps directly on the host: character device
1244          * operations and ioctls to queued block devices.
1245          */
1246         SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1247         wake_up(&host->host_wait);
1248
1249         /*
1250          * Requests piled up while recovery had everything blocked.  Kick
1251          * each device queue in turn, stopping at the first sign that the
1252          * host or the current device cannot take more work.
1253          */
1254         spin_lock_irqsave(&io_request_lock, irq_flags);
1255         for (dev = host->host_queue; dev; dev = dev->next) {
1256                 if (host->can_queue > 0 && host->host_busy >= host->can_queue)
1257                         break;
1258                 if (host->host_blocked || host->host_self_blocked)
1259                         break;
1260                 /* NOTE(review): a blocked device also ends the whole walk. */
1261                 if (dev->device_blocked)
1262                         break;
1263
1264                 queue = &dev->request_queue;
1265                 queue->request_fn(queue);
1266         }
1267         spin_unlock_irqrestore(&io_request_lock, irq_flags);
1268 }
1269
1270 /*
1271  * Function:  scsi_unjam_host
1272  *
1273  * Purpose:     Attempt to fix a host which has a command that failed for
1274  *              some reason.
1275  *
1276  * Arguments:   host    - host that needs unjamming.
1277  *
1278  * Returns:     Nothing
1279  *
1280  * Notes:       When we come in here, we *know* that all commands on the
1281  *              bus have either completed, failed or timed out.  We also
1282  *              know that no further commands are being sent to the host,
1283  *              so things are relatively quiet and we have freedom to
1284  *              fiddle with things as we wish.
1285  *
1286  * Additional note:  This is only the *default* implementation.  It is possible
1287  *              for individual drivers to supply their own version of this
1288  *              function, and if the maintainer wishes to do this, it is
1289  *              strongly suggested that this function be taken as a template
1290  *              and modified.  This function was designed to correctly handle
1291  *              problems for about 95% of the different cases out there, and
1292  *              it should always provide at least a reasonable amount of error
1293  *              recovery.
1294  *
1295  * Note3:       Any command marked 'FAILED' or 'TIMEOUT' must eventually
1296  *              have scsi_finish_command() called for it.  We do all of
1297  *              the retry stuff here, so when we restart the host after we
1298  *              return it should have an empty queue.
1299  */
1300 STATIC int scsi_unjam_host(struct Scsi_Host *host)
1301 {
1302         int devices_failed;
1303         int numfailed;
1304         int ourrtn;
1305         int rtn = FALSE;
1306         int result;
1307         Scsi_Cmnd *SCloop;
1308         Scsi_Cmnd *SCpnt;
1309         Scsi_Device *SDpnt;
1310         Scsi_Device *SDloop;
1311         Scsi_Cmnd *SCdone;
1312         int timed_out;
1313
1314         ASSERT_LOCK(&io_request_lock, 0);
1315
1316         SCdone = NULL;
1317
1318         /*
1319          * First, protect against any sort of race condition.  If any of the outstanding
1320          * commands are in states that indicate that we are not yet blocked (i.e. we are
1321          * not in a quiet state) then we got woken up in error.  If we ever end up here,
1322          * we need to re-examine some of the assumptions.
1323          */
1324         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1325                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1326                         if (SCpnt->state == SCSI_STATE_FAILED
1327                             || SCpnt->state == SCSI_STATE_TIMEOUT
1328                             || SCpnt->state == SCSI_STATE_INITIALIZING
1329                             || SCpnt->state == SCSI_STATE_UNUSED) {
1330                                 continue;
1331                         }
1332                         /*
1333                          * Rats.  Something is still floating around out there.  This could
1334                          * be the result of the fact that the upper level drivers are still frobbing
1335                          * commands that might have succeeded.  There are two outcomes.  One is that
1336                          * the command block will eventually be freed, and the other one is that
1337                          * the command will be queued and will be finished along the way.
1338                          */
1339                         SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1340
1341 /*
1342  *        panic("SCSI Error handler woken too early\n");
1343  *
1344  * This is no longer a problem, since now the code cares only about
1345  * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1346  * Other states are useful only to release active commands when devices are
1347  * set offline. If (host->host_active == host->host_busy) we can safely assume
1348  * that there are no commands in state other than TIMEOUT or FAILED. (DB)
1349  *
1350  * FIXME:
1351  * It is not easy to release correctly commands according to their state when
1352  * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1353  * When a device is set offline, we can have some command with
1354  * rq_status=RQ_SCSI_BUSY, owner=SCSI_STATE_HIGHLEVEL,
1355  * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1356  * (DB, 17 May 1998)
1357  */
1358                 }
1359         }
1360
1361         /*
1362          * Next, see if we need to request sense information.  if so,
1363          * then get it now, so we have a better idea of what to do.
1364          * FIXME(eric) this has the unfortunate side effect that if a host
1365          * adapter does not automatically request sense information, that we end
1366          * up shutting it down before we request it.  All hosts should be doing this
1367          * anyways, so for now all I have to say is tough noogies if you end up in here.
1368          * On second thought, this is probably a good idea.  We *really* want to give
1369          * authors an incentive to automatically request this.
1370          */
1371         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1372
1373         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1374                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1375                       recheck_sense_valid:
1376                         if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1377                                 continue;
1378                         }
1379                         SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1380                                                           SCpnt->target));
1381                         rtn = scsi_request_sense(SCpnt);
1382                         if (rtn != SUCCESS) {
1383                                 continue;
1384                         }
1385                         SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1386                                                   SCpnt, SCpnt->result));
1387                         SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1388
1389                         result = scsi_decide_disposition(SCpnt);
1390
1391                         /*
1392                          * If the result was normal, then just pass it along to the
1393                          * upper level.
1394                          */
1395                         if (result == SUCCESS) {
1396                                 SCpnt->host->host_failed--;
1397                                 scsi_eh_finish_command(&SCdone, SCpnt);
1398                         }
1399                         if (result != NEEDS_RETRY) {
1400                                 continue;
1401                         }
1402                         /*
1403                          * We only come in here if we want to retry a
1404                          * command.  The test to see whether the command
1405                          * should be retried should be keeping track of the
1406                          * number of tries, so we don't end up looping, of
1407                          * course.
1408                          */
1409                         SCpnt->state = NEEDS_RETRY;
1410                         rtn = scsi_eh_retry_command(SCpnt);
1411                         if (rtn != SUCCESS) {
1412                                 SCpnt->state = SCSI_STATE_FAILED;
1413                                 goto recheck_sense_valid;
1414                         }
1415                         /*
1416                          * We eventually hand this one back to the top level.
1417                          */
1418                         SCpnt->host->host_failed--;
1419                         scsi_eh_finish_command(&SCdone, SCpnt);
1420                 }
1421         }
1422
1423         /*
1424          * Go through the list of commands and figure out where we stand and how bad things
1425          * really are.
1426          */
1427         numfailed = 0;
1428         timed_out = 0;
1429         devices_failed = 0;
1430         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1431                 unsigned int device_error = 0;
1432
1433                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1434                         if (SCpnt->state == SCSI_STATE_FAILED) {
1435                                 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1436                                                          SCpnt->target));
1437                                 numfailed++;
1438                                 device_error++;
1439                         }
1440                         if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1441                                 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1442                                                          SCpnt->target));
1443                                 timed_out++;
1444                                 device_error++;
1445                         }
1446                 }
1447                 if (device_error > 0) {
1448                         devices_failed++;
1449                 }
1450         }
1451
1452         SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1453                                   numfailed, timed_out, devices_failed));
1454
1455         if (host->host_failed == 0) {
1456                 ourrtn = TRUE;
1457                 goto leave;
1458         }
1459         /*
1460          * Next, try and see whether or not it makes sense to try and abort
1461          * the running command.  This only works out to be the case if we have
1462          * one command that has timed out.  If the command simply failed, it
1463          * makes no sense to try and abort the command, since as far as the
1464          * host adapter is concerned, it isn't running.
1465          */
1466
1467         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1468
1469         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1470                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1471                         if (SCloop->state != SCSI_STATE_TIMEOUT) {
1472                                 continue;
1473                         }
1474                         rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1475                         if (rtn == SUCCESS) {
1476                                 rtn = scsi_test_unit_ready(SCloop);
1477
1478                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1479                                         rtn = scsi_eh_retry_command(SCloop);
1480
1481                                         if (rtn == SUCCESS) {
1482                                                 SCloop->host->host_failed--;
1483                                                 scsi_eh_finish_command(&SCdone, SCloop);
1484                                         }
1485                                 }
1486                         }
1487                 }
1488         }
1489
1490         /*
1491          * If we have corrected all of the problems, then we are done.
1492          */
1493         if (host->host_failed == 0) {
1494                 ourrtn = TRUE;
1495                 goto leave;
1496         }
1497         /*
1498          * Either the abort wasn't appropriate, or it didn't succeed.
1499          * Now try a bus device reset.  Still, look to see whether we have
1500          * multiple devices that are jammed or not - if we have multiple devices,
1501          * it makes no sense to try BUS_DEVICE_RESET - we really would need
1502          * to try a BUS_RESET instead.
1503          *
1504          * Does this make sense - should we try BDR on each device individually?
1505          * Yes, definitely.
1506          */
1507         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1508
1509         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1510                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1511                         if (SCloop->state == SCSI_STATE_FAILED
1512                             || SCloop->state == SCSI_STATE_TIMEOUT) {
1513                                 break;
1514                         }
1515                 }
1516
1517                 if (SCloop == NULL) {
1518                         continue;
1519                 }
1520                 /*
1521                  * OK, we have a device that is having problems.  Try and send
1522                  * a bus device reset to it.
1523                  *
1524                  * FIXME(eric) - make sure we handle the case where multiple
1525                  * commands to the same device have failed. They all must
1526                  * get properly restarted.
1527                  */
1528                 rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1529
1530                 if (rtn == SUCCESS) {
1531                         rtn = scsi_test_unit_ready(SCloop);
1532
1533                         if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1534                                 rtn = scsi_eh_retry_command(SCloop);
1535
1536                                 if (rtn == SUCCESS) {
1537                                         SCloop->host->host_failed--;
1538                                         scsi_eh_finish_command(&SCdone, SCloop);
1539                                 }
1540                         }
1541                 }
1542         }
1543
1544         if (host->host_failed == 0) {
1545                 ourrtn = TRUE;
1546                 goto leave;
1547         }
1548         /*
1549          * If we ended up here, we have serious problems.  The only thing left
1550          * to try is a full bus reset.  If someone has grabbed the bus and isn't
1551          * letting go, then perhaps this will help.
1552          */
1553         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1554
1555         /*
1556          * We really want to loop over the various channels, and do this on
1557          * a channel by channel basis.  We should also check to see if any
1558          * of the failed commands are on soft_reset devices, and if so, skip
1559          * the reset.
1560          */
1561         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1562               next_device:
1563                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1564                         if (SCpnt->state != SCSI_STATE_FAILED
1565                             && SCpnt->state != SCSI_STATE_TIMEOUT) {
1566                                 continue;
1567                         }
1568                         /*
1569                          * We have a failed command.  Make sure there are no other failed
1570                          * commands on the same channel that are timed out and implement a
1571                          * soft reset.
1572                          */
1573                         for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1574                                 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1575                                         if (SCloop->channel != SCpnt->channel) {
1576                                                 continue;
1577                                         }
1578                                         if (SCloop->state != SCSI_STATE_FAILED
1579                                             && SCloop->state != SCSI_STATE_TIMEOUT) {
1580                                                 continue;
1581                                         }
1582                                         if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1583                                                 /*
1584                                                  * If this device uses the soft reset option, and this
1585                                                  * is one of the devices acting up, then our only
1586                                                  * option is to wait a bit, since the command is
1587                                                  * supposedly still running.
1588                                                  *
1589                                                  * FIXME(eric) - right now we will just end up falling
1590                                                  * through to the 'take device offline' case.
1591                                                  *
1592                                                  * FIXME(eric) - It is possible that the command completed
1593                                                  * *after* the error recovery procedure started, and if this
1594                                                  * is the case, we are worrying about nothing here.
1595                                                  */
1596
1597                                                 scsi_sleep(1 * HZ);
1598                                                 goto next_device;
1599                                         }
1600                                 }
1601                         }
1602
1603                         /*
1604                          * We now know that we are able to perform a reset for the
1605                          * bus that SCpnt points to.  There are no soft-reset devices
1606                          * with outstanding timed out commands.
1607                          */
1608                         rtn = scsi_try_bus_reset(SCpnt);
1609                         if (rtn == SUCCESS) {
1610                                 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1611                                         for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1612                                                 if (SCloop->channel != SCpnt->channel) {
1613                                                         continue;
1614                                                 }
1615                                                 if (SCloop->state != SCSI_STATE_FAILED
1616                                                     && SCloop->state != SCSI_STATE_TIMEOUT) {
1617                                                         continue;
1618                                                 }
1619                                                 rtn = scsi_test_unit_ready(SCloop);
1620
1621                                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1622                                                         rtn = scsi_eh_retry_command(SCloop);
1623
1624                                                         if (rtn == SUCCESS) {
1625                                                                 SCpnt->host->host_failed--;
1626                                                                 scsi_eh_finish_command(&SCdone, SCloop);
1627                                                         }
1628                                                 }
1629                                                 /*
1630                                                  * If the bus reset worked, but we are still unable to
1631                                                  * talk to the device, take it offline.
1632                                                  * FIXME(eric) - is this really the correct thing to do?
1633                                                  */
1634                                                 if (rtn != SUCCESS) {
1635                                                         printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1636
1637                                                         SDloop->online = FALSE;
1638                                                         SDloop->host->host_failed--;
1639                                                         scsi_eh_finish_command(&SCdone, SCloop);
1640                                                 }
1641                                         }
1642                                 }
1643                         }
1644                 }
1645         }
1646
1647         if (host->host_failed == 0) {
1648                 ourrtn = TRUE;
1649                 goto leave;
1650         }
1651         /*
1652          * If we ended up here, we have serious problems.  The only thing left
1653          * to try is a full host reset - perhaps the firmware on the device
1654          * crashed, or something like that.
1655          *
1656          * It is assumed that a successful host reset will cause *all* information
1657          * about the command to be flushed from both the host adapter *and* the
1658          * device.
1659          *
1660          * FIXME(eric) - it isn't clear that devices that implement the soft reset
1661          * option can ever be cleared except via cycling the power.  The problem is
1662          * that sending the host reset command will cause the host to forget
1663          * about the pending command, but the device won't forget.  For now, we
1664          * skip the host reset option if any of the failed devices are configured
1665          * to use the soft reset option.
1666          */
1667         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1668               next_device2:
1669                 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1670                         if (SCpnt->state != SCSI_STATE_FAILED
1671                             && SCpnt->state != SCSI_STATE_TIMEOUT) {
1672                                 continue;
1673                         }
1674                         if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1675                                 /*
1676                                  * If this device uses the soft reset option, and this
1677                                  * is one of the devices acting up, then our only
1678                                  * option is to wait a bit, since the command is
1679                                  * supposedly still running.
1680                                  *
1681                                  * FIXME(eric) - right now we will just end up falling
1682                                  * through to the 'take device offline' case.
1683                                  */
1684                                 SCSI_LOG_ERROR_RECOVERY(3,
1685                                                         printk("scsi_unjam_host: Unable to try hard host reset\n"));
1686
1687                                 /*
1688                                  * Due to the spinlock, we will never get out of this
1689                                  * loop without a proper wait. (DB)
1690                                  */
1691                                 scsi_sleep(1 * HZ);
1692
1693                                 goto next_device2;
1694                         }
1695                         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1696
1697                         /*
1698                          * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1699                          */
1700                         rtn = scsi_try_host_reset(SCpnt);
1701                         if (rtn == SUCCESS) {
1702                                 /*
1703                                  * FIXME(eric) we assume that all commands are flushed from the
1704                                  * controller.  We should get a DID_RESET for all of the commands
1705                                  * that were pending.  We should ignore these so that we can
1706                                  * guarantee that we are in a consistent state.
1707                                  *
1708                                  * I believe this to be the case right now, but this needs to be
1709                                  * tested.
1710                                  */
1711                                 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1712                                         for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1713                                                 if (SCloop->state != SCSI_STATE_FAILED
1714                                                     && SCloop->state != SCSI_STATE_TIMEOUT) {
1715                                                         continue;
1716                                                 }
1717                                                 rtn = scsi_test_unit_ready(SCloop);
1718
1719                                                 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1720                                                         rtn = scsi_eh_retry_command(SCloop);
1721
1722                                                         if (rtn == SUCCESS) {
1723                                                                 SCpnt->host->host_failed--;
1724                                                                 scsi_eh_finish_command(&SCdone, SCloop);
1725                                                         }
1726                                                 }
1727                                                 if (rtn != SUCCESS) {
1728                                                         printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1729                                                         SDloop->online = FALSE;
1730                                                         SDloop->host->host_failed--;
1731                                                         scsi_eh_finish_command(&SCdone, SCloop);
1732                                                 }
1733                                         }
1734                                 }
1735                         }
1736                 }
1737         }
1738
1739         /*
1740          * If we solved all of the problems, then let's rev up the engines again.
1741          */
1742         if (host->host_failed == 0) {
1743                 ourrtn = TRUE;
1744                 goto leave;
1745         }
1746         /*
1747          * If the HOST RESET failed, then for now we assume that the entire host
1748          * adapter is too hosed to be of any use.  For our purposes, however, it is
1749          * easier to simply take the devices offline that correspond to commands
1750          * that failed.
1751          */
1752         SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1753
1754         for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1755                 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1756                         if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1757                                 SDloop = SCloop->device;
1758                                 if (SDloop->online == TRUE) {
1759                                         printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1760                                         SDloop->online = FALSE;
1761                                 }
1762
1763                                 /*
1764                                  * This should pass the failure up to the top level driver, and
1765                                  * it will have to try and do something intelligent with it.
1766                                  */
1767                                 SCloop->host->host_failed--;
1768
1769                                 if (SCloop->state == SCSI_STATE_TIMEOUT) {
1770                                         SCloop->result |= (DRIVER_TIMEOUT << 24);
1771                                 }
1772                                 SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1773                                     SDloop->id, SCloop->result));
1774
1775                                 scsi_eh_finish_command(&SCdone, SCloop);
1776                         }
1777                 }
1778         }
1779
1780         if (host->host_failed != 0) {
1781                 panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1782         }
1783         SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1784
1785         ourrtn = FALSE;
1786
1787       leave:
1788
1789         /*
1790          * We should have a list of commands that we 'finished' during the course of
1791          * error recovery.  This should be the same as the list of commands that timed out
1792          * or failed.  We are currently holding these things in a linked list - we didn't
1793          * put them in the bottom half queue because we wanted to keep things quiet while
1794          * we were working on recovery, and passing them up to the top level could easily
1795          * cause the top level to try and queue something else again.
1796          *
1797          * Start by marking that the host is no longer in error recovery.
1798          */
1799         host->in_recovery = 0;
1800
1801         /*
1802          * Take the list of commands, and stick them in the bottom half queue.
1803          * The current implementation of scsi_done will do this for us - if need
1804          * be we can create a special version of this function to do the
1805          * same job for us.
1806          */
1807         for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1808                 SCdone = SCpnt->bh_next;
1809                 SCpnt->bh_next = NULL;
1810                 /*
1811                  * Oh, this is a vile hack.  scsi_done() expects a timer
1812                  * to be running on the command.  If there isn't, it assumes
1813                  * that the command has actually timed out, and a timer
1814                  * handler is running.  That may well be how we got into
1815                  * this fix, but right now things are stable.  We add
1816                  * a timer back again so that we can report completion.
1817                  * scsi_done() will immediately remove said timer from
1818                  * the command, and then process it.
1819                  */
1820                 scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
1821                 scsi_done(SCpnt);
1822         }
1823
1824         return (ourrtn);
1825 }
1826
1827
1828 /*
1829  * Function:  scsi_error_handler
1830  *
1831  * Purpose:     Handle errors/timeouts of scsi commands, try and clean up
1832  *              and unjam the bus, and restart things.
1833  *
1834  * Arguments:   host    - host for which we are running.
1835  *
1836  * Returns:     Never returns.
1837  *
1838  * Notes:       This is always run in the context of a kernel thread.  The
1839  *              idea is that we start this thing up when the kernel starts
1840  *              up (one per host that we detect), and it immediately goes to
1841  *              sleep and waits for some event (i.e. failure).  When this
1842  *              takes place, we have the job of trying to unjam the bus
1843  *              and restarting things.
1844  *
1845  */
1846 void scsi_error_handler(void *data)
1847 {
1848         struct Scsi_Host *host = (struct Scsi_Host *) data;
1849         int rtn;
1850         DECLARE_MUTEX_LOCKED(sem);
1851
1852         /*
1853          * We only listen to signals if the HA was loaded as a module.
1854          * If the HA was compiled into the kernel, then we don't listen
1855          * to any signals.
1856          */
1857         if( host->loaded_as_module ) {
1858         siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
1859         } else {
1860         siginitsetinv(&current->blocked, 0);
1861         }
1862
1863         lock_kernel();
1864
1865         /*
1866          *    Flush resources
1867          */
1868
1869         daemonize();
1870         reparent_to_init();
1871
1872         /*
1873          * Set the name of this process.
1874          */
1875
1876         sprintf(current->comm, "scsi_eh_%d", host->host_no);
1877
1878         host->eh_wait = &sem;
1879         host->ehandler = current;
1880
1881         unlock_kernel();
1882
1883         /*
1884          * Wake up the thread that created us.
1885          */
1886         SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", sem_getcount(host->eh_notify)));
1887
1888         up(host->eh_notify);
1889
1890         while (1) {
1891                 /*
1892                  * If we get a signal, it means we are supposed to go
1893                  * away and die.  This typically happens if the user is
1894                  * trying to unload a module.
1895                  */
1896                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1897
1898                 /*
1899                  * Note - we always use down_interruptible with the semaphore
1900                  * even if the module was loaded as part of the kernel.  The
1901                  * reason is that down() will cause this thread to be counted
1902                  * in the load average as a running process, and down
1903                  * interruptible doesn't.  Given that we need to allow this
1904                  * thread to die if the driver was loaded as a module, using
1905                  * semaphores isn't unreasonable.
1906                  */
1907                 down_interruptible(&sem);
1908                 if( host->loaded_as_module ) {
1909                         if (signal_pending(current))
1910                                 break;
1911                 }
1912
1913                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1914
1915                 host->eh_active = 1;
1916
1917                 /*
1918                  * We have a host that is failing for some reason.  Figure out
1919                  * what we need to do to get it up and online again (if we can).
1920                  * If we fail, we end up taking the thing offline.
1921                  */
1922                 if (host->hostt->eh_strategy_handler != NULL) {
1923                         rtn = host->hostt->eh_strategy_handler(host);
1924                 } else {
1925                         rtn = scsi_unjam_host(host);
1926                 }
1927
1928                 host->eh_active = 0;
1929
1930                 /*
1931                  * Note - if the above fails completely, the action is to take
1932                  * individual devices offline and flush the queue of any
1933                  * outstanding requests that may have been pending.  When we
1934                  * restart, we restart any I/O to any other devices on the bus
1935                  * which are still online.
1936                  */
1937                 scsi_restart_operations(host);
1938
1939         }
1940
1941         SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1942
1943         /*
1944          * Make sure that nobody tries to wake us up again.
1945          */
1946         host->eh_wait = NULL;
1947
1948         /*
1949          * Knock this down too.  From this point on, the host is flying
1950          * without a pilot.  If this is because the module is being unloaded,
1951          * that's fine.  If the user sent a signal to this thing, we are
1952          * potentially in real danger.
1953          */
1954         host->in_recovery = 0;
1955         host->eh_active = 0;
1956         host->ehandler = NULL;
1957
1958         /*
1959          * If anyone is waiting for us to exit (i.e. someone trying to unload
1960          * a driver), then wake up that process to let them know we are on
1961          * the way out the door.  This may be overkill - I *think* that we
1962          * could probably just unload the driver and send the signal, and when
1963          * the error handling thread wakes up that it would just exit without
1964          * needing to touch any memory associated with the driver itself.
1965          */
1966         if (host->eh_notify != NULL)
1967                 up(host->eh_notify);
1968 }
1969
1970 /*
1971  * Function:    scsi_new_reset
1972  *
1973  * Purpose:     Send requested reset to a bus or device at any phase.
1974  *
1975  * Arguments:   SCpnt   - command ptr to send reset with (usually a dummy)
1976  *              flag - reset type (see scsi.h)
1977  *
1978  * Returns:     SUCCESS/FAILURE.
1979  *
1980  * Notes:       This is used by the SCSI Generic driver to provide
1981  *              Bus/Device reset capability.
1982  */
1983 int
1984 scsi_new_reset(Scsi_Cmnd *SCpnt, int flag)
1985 {
1986         int rtn;
1987
1988         switch(flag) {
1989         case SCSI_TRY_RESET_DEVICE:
1990                 rtn = scsi_try_bus_device_reset(SCpnt, 0);
1991                 if (rtn == SUCCESS)
1992                         break;
1993                 /* FALLTHROUGH */
1994         case SCSI_TRY_RESET_BUS:
1995                 rtn = scsi_try_bus_reset(SCpnt);
1996                 if (rtn == SUCCESS)
1997                         break;
1998                 /* FALLTHROUGH */
1999         case SCSI_TRY_RESET_HOST:
2000                 rtn = scsi_try_host_reset(SCpnt);
2001                 break;
2002         default:
2003                 rtn = FAILED;
2004         }
2005
2006         return rtn;
2007 }
2008
2009 /*
2010  * Overrides for Emacs so that we follow Linus's tabbing style.
2011  * Emacs will notice this stuff at the end of the file and automatically
2012  * adjust the settings for this buffer only.  This must remain at the end
2013  * of the file.
2014  * ---------------------------------------------------------------------------
2015  * Local variables:
2016  * c-indent-level: 4
2017  * c-brace-imaginary-offset: 0
2018  * c-brace-offset: -4
2019  * c-argdecl-indent: 4
2020  * c-label-offset: -4
2021  * c-continued-statement-offset: 4
2022  * c-continued-brace-offset: 0
2023  * indent-tabs-mode: nil
2024  * tab-width: 8
2025  * End:
2026  */