aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorTejun Heo <htejun@gmail.com>2007-02-02 02:22:31 -0500
committerJeff Garzik <jeff@garzik.org>2007-02-21 04:58:16 -0500
commit7d47e8d4d4fb0c3d3bdc706759e70d5453b61ec3 (patch)
treef197f58dde8fbc795a37c9645b02490749f16e7a /drivers
parent4ae72a1e469a3bcfd3c1f77dac62392c489bf9ca (diff)
libata: put some intelligence into EH speed down sequence
The current EH speed down code is more of a proof that the EH framework is capable of adjusting transfer speed in response to errors. This patch puts some intelligence into the EH speed down sequence. The rules are: * If there have been more than three timeouts, HSM violations or unclassified DEV errors for known supported commands during the last 10 mins, NCQ is turned off. * If there have been more than three timeouts or HSM violations for known supported commands, transfer mode is slowed down. If DMA is active, it is first slowed by one grade (e.g. UDMA133->100). If that doesn't help, it's slowed to the 40c limit (UDMA33). If PIO is active, it's slowed by one grade first. If that doesn't help, PIO0 is forced. Note that this rule does not change transfer mode. DMA is never degraded into PIO by this rule. * If there have been more than ten ATA bus, timeout, HSM violation or unclassified device errors for known supported commands && speeding down DMA mode didn't help, the device is forced into PIO mode. Note that this rule is considered only for PATA devices and is pretty difficult to trigger. One error can only trigger one rule at a time. After a rule is triggered, error history is cleared such that the next speed down happens only after some number of errors are accumulated. This makes sense because speed down is now done in bigger strides. Signed-off-by: Tejun Heo <htejun@gmail.com> Signed-off-by: Jeff Garzik <jeff@garzik.org>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/ata/libata-eh.c184
1 files changed, 124 insertions, 60 deletions
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 1abfdba8d99b..31738627ec64 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -44,6 +44,12 @@
44 44
45#include "libata.h" 45#include "libata.h"
46 46
47enum {
48 ATA_EH_SPDN_NCQ_OFF = (1 << 0),
49 ATA_EH_SPDN_SPEED_DOWN = (1 << 1),
50 ATA_EH_SPDN_FALLBACK_TO_PIO = (1 << 2),
51};
52
47static void __ata_port_freeze(struct ata_port *ap); 53static void __ata_port_freeze(struct ata_port *ap);
48static void ata_eh_finish(struct ata_port *ap); 54static void ata_eh_finish(struct ata_port *ap);
49static void ata_eh_handle_port_suspend(struct ata_port *ap); 55static void ata_eh_handle_port_suspend(struct ata_port *ap);
@@ -65,12 +71,9 @@ static void ata_ering_record(struct ata_ering *ering, int is_io,
65 ent->timestamp = get_jiffies_64(); 71 ent->timestamp = get_jiffies_64();
66} 72}
67 73
68static struct ata_ering_entry * ata_ering_top(struct ata_ering *ering) 74static void ata_ering_clear(struct ata_ering *ering)
69{ 75{
70 struct ata_ering_entry *ent = &ering->ring[ering->cursor]; 76 memset(ering, 0, sizeof(*ering));
71 if (!ent->err_mask)
72 return NULL;
73 return ent;
74} 77}
75 78
76static int ata_ering_map(struct ata_ering *ering, 79static int ata_ering_map(struct ata_ering *ering,
@@ -1159,87 +1162,99 @@ static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc,
1159 return action; 1162 return action;
1160} 1163}
1161 1164
1162static int ata_eh_categorize_ering_entry(struct ata_ering_entry *ent) 1165static int ata_eh_categorize_error(int is_io, unsigned int err_mask)
1163{ 1166{
1164 if (ent->err_mask & (AC_ERR_ATA_BUS | AC_ERR_TIMEOUT)) 1167 if (err_mask & AC_ERR_ATA_BUS)
1165 return 1; 1168 return 1;
1166 1169
1167 if (ent->is_io) { 1170 if (err_mask & AC_ERR_TIMEOUT)
1168 if (ent->err_mask & AC_ERR_HSM) 1171 return 2;
1169 return 1; 1172
1170 if ((ent->err_mask & 1173 if (is_io) {
1171 (AC_ERR_DEV|AC_ERR_MEDIA|AC_ERR_INVALID)) == AC_ERR_DEV) 1174 if (err_mask & AC_ERR_HSM)
1172 return 2; 1175 return 2;
1176 if ((err_mask &
1177 (AC_ERR_DEV|AC_ERR_MEDIA|AC_ERR_INVALID)) == AC_ERR_DEV)
1178 return 3;
1173 } 1179 }
1174 1180
1175 return 0; 1181 return 0;
1176} 1182}
1177 1183
1178struct speed_down_needed_arg { 1184struct speed_down_verdict_arg {
1179 u64 since; 1185 u64 since;
1180 int nr_errors[3]; 1186 int nr_errors[4];
1181}; 1187};
1182 1188
1183static int speed_down_needed_cb(struct ata_ering_entry *ent, void *void_arg) 1189static int speed_down_verdict_cb(struct ata_ering_entry *ent, void *void_arg)
1184{ 1190{
1185 struct speed_down_needed_arg *arg = void_arg; 1191 struct speed_down_verdict_arg *arg = void_arg;
1192 int cat = ata_eh_categorize_error(ent->is_io, ent->err_mask);
1186 1193
1187 if (ent->timestamp < arg->since) 1194 if (ent->timestamp < arg->since)
1188 return -1; 1195 return -1;
1189 1196
1190 arg->nr_errors[ata_eh_categorize_ering_entry(ent)]++; 1197 arg->nr_errors[cat]++;
1191 return 0; 1198 return 0;
1192} 1199}
1193 1200
1194/** 1201/**
1195 * ata_eh_speed_down_needed - Determine wheter speed down is necessary 1202 * ata_eh_speed_down_verdict - Determine speed down verdict
1196 * @dev: Device of interest 1203 * @dev: Device of interest
1197 * 1204 *
1198 * This function examines error ring of @dev and determines 1205 * This function examines error ring of @dev and determines
1199 * whether speed down is necessary. Speed down is necessary if 1206 * whether NCQ needs to be turned off, transfer speed should be
1200 * there have been more than 3 of Cat-1 errors or 10 of Cat-2 1207 * stepped down, or falling back to PIO is necessary.
1201 * errors during last 15 minutes. 1208 *
1209 * Cat-1 is ATA_BUS error for any command.
1202 * 1210 *
1203 * Cat-1 errors are ATA_BUS, TIMEOUT for any command and HSM 1211 * Cat-2 is TIMEOUT for any command or HSM violation for known
1204 * violation for known supported commands. 1212 * supported commands.
1205 * 1213 *
1206 * Cat-2 errors are unclassified DEV error for known supported 1214 * Cat-3 is unclassified DEV error for known supported
1207 * command. 1215 * command.
1208 * 1216 *
1217 * NCQ needs to be turned off if there have been more than 3
1218 * Cat-2 + Cat-3 errors during last 10 minutes.
1219 *
1220 * Speed down is necessary if there have been more than 3 Cat-1 +
1221 * Cat-2 errors or 10 Cat-3 errors during last 10 minutes.
1222 *
1223 * Falling back to PIO mode is necessary if there have been more
1224 * than 10 Cat-1 + Cat-2 + Cat-3 errors during last 5 minutes.
1225 *
1209 * LOCKING: 1226 * LOCKING:
1210 * Inherited from caller. 1227 * Inherited from caller.
1211 * 1228 *
1212 * RETURNS: 1229 * RETURNS:
1213 * 1 if speed down is necessary, 0 otherwise 1230 * OR of ATA_EH_SPDN_* flags.
1214 */ 1231 */
1215static int ata_eh_speed_down_needed(struct ata_device *dev) 1232static unsigned int ata_eh_speed_down_verdict(struct ata_device *dev)
1216{ 1233{
1217 const u64 interval = 15LLU * 60 * HZ; 1234 const u64 j5mins = 5LLU * 60 * HZ, j10mins = 10LLU * 60 * HZ;
1218 static const int err_limits[3] = { -1, 3, 10 }; 1235 u64 j64 = get_jiffies_64();
1219 struct speed_down_needed_arg arg; 1236 struct speed_down_verdict_arg arg;
1220 struct ata_ering_entry *ent; 1237 unsigned int verdict = 0;
1221 int err_cat;
1222 u64 j64;
1223 1238
1224 ent = ata_ering_top(&dev->ering); 1239 /* scan past 10 mins of error history */
1225 if (!ent) 1240 memset(&arg, 0, sizeof(arg));
1226 return 0; 1241 arg.since = j64 - min(j64, j10mins);
1242 ata_ering_map(&dev->ering, speed_down_verdict_cb, &arg);
1227 1243
1228 err_cat = ata_eh_categorize_ering_entry(ent); 1244 if (arg.nr_errors[2] + arg.nr_errors[3] > 3)
1229 if (err_cat == 0) 1245 verdict |= ATA_EH_SPDN_NCQ_OFF;
1230 return 0; 1246 if (arg.nr_errors[1] + arg.nr_errors[2] > 3 || arg.nr_errors[3] > 10)
1247 verdict |= ATA_EH_SPDN_SPEED_DOWN;
1231 1248
1249 /* scan past 5 mins of error history */
1232 memset(&arg, 0, sizeof(arg)); 1250 memset(&arg, 0, sizeof(arg));
1251 arg.since = j64 - min(j64, j5mins);
1252 ata_ering_map(&dev->ering, speed_down_verdict_cb, &arg);
1233 1253
1234 j64 = get_jiffies_64(); 1254 if (arg.nr_errors[1] + arg.nr_errors[2] + arg.nr_errors[3] > 10)
1235 if (j64 >= interval) 1255 verdict |= ATA_EH_SPDN_FALLBACK_TO_PIO;
1236 arg.since = j64 - interval;
1237 else
1238 arg.since = 0;
1239
1240 ata_ering_map(&dev->ering, speed_down_needed_cb, &arg);
1241 1256
1242 return arg.nr_errors[err_cat] > err_limits[err_cat]; 1257 return verdict;
1243} 1258}
1244 1259
1245/** 1260/**
@@ -1257,31 +1272,80 @@ static int ata_eh_speed_down_needed(struct ata_device *dev)
1257 * Kernel thread context (may sleep). 1272 * Kernel thread context (may sleep).
1258 * 1273 *
1259 * RETURNS: 1274 * RETURNS:
1260 * 0 on success, -errno otherwise 1275 * Determined recovery action.
1261 */ 1276 */
1262static int ata_eh_speed_down(struct ata_device *dev, int is_io, 1277static unsigned int ata_eh_speed_down(struct ata_device *dev, int is_io,
1263 unsigned int err_mask) 1278 unsigned int err_mask)
1264{ 1279{
1265 if (!err_mask) 1280 unsigned int verdict;
1281 unsigned int action = 0;
1282
1283 /* don't bother if Cat-0 error */
1284 if (ata_eh_categorize_error(is_io, err_mask) == 0)
1266 return 0; 1285 return 0;
1267 1286
1268 /* record error and determine whether speed down is necessary */ 1287 /* record error and determine whether speed down is necessary */
1269 ata_ering_record(&dev->ering, is_io, err_mask); 1288 ata_ering_record(&dev->ering, is_io, err_mask);
1289 verdict = ata_eh_speed_down_verdict(dev);
1270 1290
1271 if (!ata_eh_speed_down_needed(dev)) 1291 /* turn off NCQ? */
1272 return 0; 1292 if ((verdict & ATA_EH_SPDN_NCQ_OFF) &&
1293 (dev->flags & (ATA_DFLAG_PIO | ATA_DFLAG_NCQ |
1294 ATA_DFLAG_NCQ_OFF)) == ATA_DFLAG_NCQ) {
1295 dev->flags |= ATA_DFLAG_NCQ_OFF;
1296 ata_dev_printk(dev, KERN_WARNING,
1297 "NCQ disabled due to excessive errors\n");
1298 goto done;
1299 }
1300
1301 /* speed down? */
1302 if (verdict & ATA_EH_SPDN_SPEED_DOWN) {
1303 /* speed down SATA link speed if possible */
1304 if (sata_down_spd_limit(dev->ap) == 0) {
1305 action |= ATA_EH_HARDRESET;
1306 goto done;
1307 }
1273 1308
1274 /* speed down SATA link speed if possible */ 1309 /* lower transfer mode */
1275 if (sata_down_spd_limit(dev->ap) == 0) 1310 if (dev->spdn_cnt < 2) {
1276 return ATA_EH_HARDRESET; 1311 static const int dma_dnxfer_sel[] =
1312 { ATA_DNXFER_DMA, ATA_DNXFER_40C };
1313 static const int pio_dnxfer_sel[] =
1314 { ATA_DNXFER_PIO, ATA_DNXFER_FORCE_PIO0 };
1315 int sel;
1277 1316
1278 /* lower transfer mode */ 1317 if (dev->xfer_shift != ATA_SHIFT_PIO)
1279 if (ata_down_xfermask_limit(dev, ATA_DNXFER_ANY) == 0) 1318 sel = dma_dnxfer_sel[dev->spdn_cnt];
1280 return ATA_EH_SOFTRESET; 1319 else
1320 sel = pio_dnxfer_sel[dev->spdn_cnt];
1321
1322 dev->spdn_cnt++;
1323
1324 if (ata_down_xfermask_limit(dev, sel) == 0) {
1325 action |= ATA_EH_SOFTRESET;
1326 goto done;
1327 }
1328 }
1329 }
1330
1331 /* Fall back to PIO? Slowing down to PIO is meaningless for
1332 * SATA. Consider it only for PATA.
1333 */
1334 if ((verdict & ATA_EH_SPDN_FALLBACK_TO_PIO) && (dev->spdn_cnt >= 2) &&
1335 (dev->ap->cbl != ATA_CBL_SATA) &&
1336 (dev->xfer_shift != ATA_SHIFT_PIO)) {
1337 if (ata_down_xfermask_limit(dev, ATA_DNXFER_FORCE_PIO) == 0) {
1338 dev->spdn_cnt = 0;
1339 action |= ATA_EH_SOFTRESET;
1340 goto done;
1341 }
1342 }
1281 1343
1282 ata_dev_printk(dev, KERN_ERR,
1283 "speed down requested but no transfer mode left\n");
1284 return 0; 1344 return 0;
1345 done:
1346 /* device has been slowed down, blow error history */
1347 ata_ering_clear(&dev->ering);
1348 return action;
1285} 1349}
1286 1350
1287/** 1351/**