BLASTing Linux Code

Jan Tobias Mühlberg and Gerald Lüttgen
Department of Computer Science, University of York, York YO10 5DD, U.K.


main page | next example


Commit Overview | Files | Comments

Checking Locking Properties: Example 4


Commit Overview

Commit Key abd559b1052e28d8b9c28aabde241f18fa89090b
Subject [PATCH] sbp2: fix deadlocks and delays on device removal/rmmod
Description Fixes for deadlocks of the ieee1394 and scsi subsystems and long delays in futile error recovery attempts when SBP-2 devices are removed or drivers are unloaded.
- Complete commands quickly with DID_NO_CONNECT if the 1394 node is gone or if the 1394 low-level driver was unloaded.
- Skip unnecessary work in the eh_abort_handler and eh_device_reset_handler if the node or 1394 low-level driver is gone.
- Let scsi's high-level shut down gracefully when sbp2 is being unloaded or detached from the 1394 unit. A call to scsi_remove_device is added for this purpose, which requires us to store a scsi_device pointer.
- scsi_device pointer is obtained from slave_alloc hook and cleared by slave_destroy. This avoids usage of the pointer after the scsi device was deleted e.g. by the user via scsi_mod's sysfs interface.
Requires Linux 2.6.14 kernel source as from git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/linux-2.6.14.y.git

--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -596,6 +596,14 @@ static void sbp2util_mark_command_comple
spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
}
+/*
+ * Is scsi_id valid? Is the 1394 node still present?
+ */
+static inline int sbp2util_node_is_available(struct scsi_id_instance_data *scsi_id)
+{
+ return scsi_id && scsi_id->ne && !scsi_id->ne->in_limbo;
+}
+
/*********************************************
@@ -631,11 +639,23 @@ static int sbp2_remove(struct device *de
{
struct unit_directory *ud;
struct scsi_id_instance_data *scsi_id;
+ struct scsi_device *sdev;
SBP2_DEBUG("sbp2_remove");
ud = container_of(dev, struct unit_directory, device);
scsi_id = ud->device.driver_data;
+ if (!scsi_id)
+ return 0;
+
+ /* Trigger shutdown functions in scsi's highlevel. */
+ if (scsi_id->scsi_host)
+ scsi_unblock_requests(scsi_id->scsi_host);
+ sdev = scsi_id->sdev;
+ if (sdev) {
+ scsi_id->sdev = NULL;
+ scsi_remove_device(sdev);
+ }
sbp2_logout_device(scsi_id);
sbp2_remove_device(scsi_id);
@@ -2473,37 +2493,26 @@ static int sbp2scsi_queuecommand(struct
struct scsi_id_instance_data *scsi_id =
(struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0];
struct sbp2scsi_host_info *hi;
+ int result = DID_NO_CONNECT << 16;
SBP2_DEBUG("sbp2scsi_queuecommand");
- /*
- * If scsi_id is null, it means there is no device in this slot,
- * so we should return selection timeout.
- */
- if (!scsi_id) {
- SCpnt->result = DID_NO_CONNECT << 16;
- done (SCpnt);
- return 0;
- }
+ if (!sbp2util_node_is_available(scsi_id))
+ goto done;
hi = scsi_id->hi;
if (!hi) {
SBP2_ERR("sbp2scsi_host_info is NULL - this is bad!");
- SCpnt->result = DID_NO_CONNECT << 16;
- done (SCpnt);
- return(0);
+ goto done;
}
/*
* Until we handle multiple luns, just return selection time-out
* to any IO directed at non-zero LUNs
*/
- if (SCpnt->device->lun) {
- SCpnt->result = DID_NO_CONNECT << 16;
- done (SCpnt);
- return(0);
- }
+ if (SCpnt->device->lun)
+ goto done;
/*
* Check for request sense command, and handle it here
@@ -2514,7 +2523,7 @@ static int sbp2scsi_queuecommand(struct
memcpy(SCpnt->request_buffer, SCpnt->sense_buffer, SCpnt->request_bufflen);
memset(SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
sbp2scsi_complete_command(scsi_id, SBP2_SCSI_STATUS_GOOD, SCpnt, done);
- return(0);
+ return 0;
}
/*
@@ -2522,9 +2531,8 @@ static int sbp2scsi_queuecommand(struct
*/
if (!hpsb_node_entry_valid(scsi_id->ne)) {
SBP2_ERR("Bus reset in progress - rejecting command");
- SCpnt->result = DID_BUS_BUSY << 16;
- done (SCpnt);
- return(0);
+ result = DID_BUS_BUSY << 16;
+ goto done;
}
/*
@@ -2535,8 +2543,12 @@ static int sbp2scsi_queuecommand(struct
sbp2scsi_complete_command(scsi_id, SBP2_SCSI_STATUS_SELECTION_TIMEOUT,
SCpnt, done);
}
+ return 0;
- return(0);
+done:
+ SCpnt->result = result;
+ done(SCpnt);
+ return 0;
}
/*
@@ -2683,14 +2695,27 @@ static void sbp2scsi_complete_command(st
}
-static int sbp2scsi_slave_configure (struct scsi_device *sdev)
+static int sbp2scsi_slave_alloc(struct scsi_device *sdev)
{
- blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
+ ((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = sdev;
+ return 0;
+}
+
+static int sbp2scsi_slave_configure(struct scsi_device *sdev)
+{
+ blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
return 0;
}
+static void sbp2scsi_slave_destroy(struct scsi_device *sdev)
+{
+ ((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = NULL;
+ return;
+}
+
+
/*
* Called by scsi stack when something has really gone wrong. Usually
* called when a command has timed-out for some reason.
@@ -2705,7 +2730,7 @@ static int sbp2scsi_abort(struct scsi_cm
SBP2_ERR("aborting sbp2 command");
scsi_print_command(SCpnt);
- if (scsi_id) {
+ if (sbp2util_node_is_available(scsi_id)) {
/*
* Right now, just return any matching command structures
@@ -2742,31 +2767,24 @@ static int sbp2scsi_abort(struct scsi_cm
/*
* Called by scsi stack when something has really gone wrong.
*/
-static int __sbp2scsi_reset(struct scsi_cmnd *SCpnt)
+static int sbp2scsi_reset(struct scsi_cmnd *SCpnt)
{
struct scsi_id_instance_data *scsi_id =
(struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0];
+ unsigned long flags;
SBP2_ERR("reset requested");
- if (scsi_id) {
+ spin_lock_irqsave(SCpnt->device->host->host_lock, flags);
+
+ if (sbp2util_node_is_available(scsi_id)) {
SBP2_ERR("Generating sbp2 fetch agent reset");
sbp2_agent_reset(scsi_id, 0);
}
- return(SUCCESS);
-}
-
-static int sbp2scsi_reset(struct scsi_cmnd *SCpnt)
-{
- unsigned long flags;
- int rc;
-
- spin_lock_irqsave(SCpnt->device->host->host_lock, flags);
- rc = __sbp2scsi_reset(SCpnt);
spin_unlock_irqrestore(SCpnt->device->host->host_lock, flags);
- return rc;
+ return SUCCESS;
}
static const char *sbp2scsi_info (struct Scsi_Host *host)
@@ -2817,7 +2835,9 @@ static struct scsi_host_template scsi_dr
.eh_device_reset_handler = sbp2scsi_reset,
.eh_bus_reset_handler = sbp2scsi_reset,
.eh_host_reset_handler = sbp2scsi_reset,
+ .slave_alloc = sbp2scsi_slave_alloc,
.slave_configure = sbp2scsi_slave_configure,
+ .slave_destroy = sbp2scsi_slave_destroy,
.this_id = -1,
.sg_tablesize = SG_ALL,
.use_clustering = ENABLE_CLUSTERING,

(purple: line numbers and function names; red: line removed; green: line added)

Files

Unmodified sources

Comments

This is an interesting case in which no detailed explanation of the deadlock that should be found in this source file is given. Despite trying several different attempts to model check simplifications of this source code, we did not find any locking related problems, either in the version of the source file used in Linux 2.6.13 nor in 2.6.14.

Source: http://www.kernel.org



Jan Tobias Mühlberg, $Date$