ssm-wearout Old usr/src/cmd/fm/eversholt/files/common/disk.esc

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #pragma dictionary "DISK"
  26 
  27 #define P                       disk
  28 
  29 fru P;		/* P (disk) is declared as a FRU */
  30 asru P;		/* P (disk) is declared as an ASRU */
  31 
  32 /*
  33  * Overall comments for this file:
  34  * <disk-as-detector> The disk-as-detector DE provides the mapping between
  35  * ereports generated by a kernel disk driver sd(7D) and resulting faults.
  36  */
  37 
  38 /*
  39  * SERD engine for media error fault propagation:
  40  *
  41  * This strategy is designed to give a file system, like ZFS, the
  42  * ability to attempt data recovery/relocation without faulting a disk.
  43  * This implementation depends on a file system retry to the same LBA
  44  * to trigger a fault when recovery/relocation is not possible.
  45  *
  46  * We let the engine propagate one error only once every minute; if we then
  47  * still get 2 or more errors within 24 hours for the same LBA, it is a fault.
  48  */
  49 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
  50 
  51 /*
  52  * disk-as-detector: fault events (merr is tied to the merr SERD engine).
  53  */
  54 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
  55 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
  56     engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
  57 
  58 /*
  59  * The uderr fault will be defined at some future time.
  60  * event fault.io.scsi.cmd.disk.dev.uderr@P;
  61  */
  62 
  63 /*
  64  * disk-as-detector: upset events, one per kernel ereport class.
  65  * NOTE: For now we define an upset only as a way to implement discard.
  66  */
  67 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
  68 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
  69 event upset.io.scsi.cmd.disk.dev.uderr@P;
  70 event upset.io.scsi.cmd.disk.dev.serr@P;
  71 event upset.io.scsi.cmd.disk.tran@P;
  72 event upset.io.scsi.cmd.disk.recovered@P;
  73 
  74 /*
  75  * disk-as-detector: ereports from the kernel.
  76  *
  77  * We don't know the topology for all SCSI disks, but the kernel will always
  78  * generate ereport telemetry assuming that we do. We define these ereports
  79  * with 'discard_if_config_unknown=1', which permits ereports against disks
  80  * with unknown topology to be silently discarded.  The ereport data is logged
  81  * in either case, and can be viewed via 'fmdump -eV'.
  82  */
  83 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
  84 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
  85 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
  86 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
  87 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
  88 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
  89 
  90 /*
  91  * For some ereports we let the 'driver-assessment', communicated as part of
  92  * the ereport payload, determine fault vs. upset via propagation constraints.
  93  */
  94 #define DRIVER_ASSESSMENT_FATAL         \
  95             (payloadprop_contains("driver-assessment", "fatal"))
  96 #define DRIVER_ASSESSMENT_NONFATAL      (!DRIVER_ASSESSMENT_FATAL)
  97 
  98 /*
  99  * disk-as-detector: propagations from faults (based on
 100  * DRIVER_ASSESSMENT_FATAL).
 101  * We set additional fault payload members to indicate fault details.
 102  * The members we may need are listed below (op_code is not copied yet):
 103  * fault.io.scsi.cmd.disk.dev.rqs.derr
 104  *     op_code, key, asc, ascq
 105  * fault.io.scsi.cmd.disk.dev.rqs.merr
 106  *     op_code, key, asc, ascq, lba
 107  */
 108 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
 109     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
 110     setpayloadprop("key", payloadprop("key")) &&
 111     setpayloadprop("asc", payloadprop("asc")) &&
 112     setpayloadprop("ascq", payloadprop("ascq"))};
 113 
 114 /*
 115  * Use setserdsuffix() with the ereport's LBA so that the SERD engine
 116  * only triggers if the error recurs on the same LBA.
 117  */
 118 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
 119     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
 120     setserdsuffix(payloadprop("lba")) &&
 121     setpayloadprop("key", payloadprop("key")) &&
 122     setpayloadprop("asc", payloadprop("asc")) &&
 123     setpayloadprop("ascq", payloadprop("ascq")) &&
 124     setpayloadprop("lba", payloadprop("lba"))};
 125 
 126 /*
 127  * NOTE: this propagation uses the "may" (0) form of eversholt propagation.
 128  * The ereport need never exist. It's just a way of making
 129  * the diagnosis wait for the within() time (60s) on that ereport
 130  * to complete. Once it has completed the diagnosis continues
 131  * even though the dummy ereport didn't occur.
 132  */
 133 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
 134 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
 135         ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
 136 
 137 /*
 138  * The uderr fault will be propagated at some future time.
 139  * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 140  *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 141  */
 142 
 143 /*
 144  * disk-as-detector: upset propagations taken when the driver-assessment
 145  * is not "fatal" (DRIVER_ASSESSMENT_NONFATAL).
 146  */
 147 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P
 148     -> ereport.io.scsi.cmd.disk.dev.rqs.derr@P { DRIVER_ASSESSMENT_NONFATAL };
 149 
 150 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P
 151     -> ereport.io.scsi.cmd.disk.dev.rqs.merr@P { DRIVER_ASSESSMENT_NONFATAL };
 152 
 153 /*
 154  * disk-as-detector: unconditional upset propagations.  These carry no
 155  * constraint, so they apply whatever the driver-assessment is.
 156  */
 157 
 158 prop upset.io.scsi.cmd.disk.dev.serr@P
 159     -> ereport.io.scsi.cmd.disk.dev.serr@P;
 160 
 161 prop upset.io.scsi.cmd.disk.dev.uderr@P
 162     -> ereport.io.scsi.cmd.disk.dev.uderr@P;
 163 
 164 prop upset.io.scsi.cmd.disk.recovered@P
 165     -> ereport.io.scsi.cmd.disk.recovered@P;
 166 
 167 prop upset.io.scsi.cmd.disk.tran@P
 168     -> ereport.io.scsi.cmd.disk.tran@P;
 169 
 170 /*
 171  * --------------------------------------
 172  * The remainder of this file contains rules associated with the operation of
 173  * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 174  * 
 175  * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 176  * generated by the disk-transport fmd module, and the resulting faults.
 177  */
 178 
 179 /*
 180  * Fault events.  Each names the disk as both FRU and ASRU, FITrate 10.
 181  */
 182 event fault.io.disk.over-temperature@P,
 183     FITrate=10, FRU=P, ASRU=P;
 184 event fault.io.disk.predictive-failure@P,
 185     FITrate=10, FRU=P, ASRU=P;
 186 event fault.io.disk.self-test-failure@P,
 187     FITrate=10, FRU=P, ASRU=P;
 188 
 189 /*
 190  * ereports (the SCSI disk events these faults are diagnosed from).
 191  */
 192 event ereport.io.scsi.disk.over-temperature@P;
 193 event ereport.io.scsi.disk.predictive-failure@P;
 194 event ereport.io.scsi.disk.self-test-failure@P;
 195 
 196 /*
 197  * Propagations: each fault maps 1-to-1 onto the ereport of the same name.
 198  */
 199 prop fault.io.disk.over-temperature@P
 200     -> ereport.io.scsi.disk.over-temperature@P;
 201 
 202 prop fault.io.disk.predictive-failure@P
 203     -> ereport.io.scsi.disk.predictive-failure@P {
 204     setpayloadprop("asc", payloadprop("additional-sense-code")) &&
 205     setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
 206 
 207 prop fault.io.disk.self-test-failure@P
 208     -> ereport.io.scsi.disk.self-test-failure@P;