ssm-wearout New usr/src/cmd/fm/eversholt/files/common/disk.esc

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 #pragma dictionary "DISK"
  27 
  28 #define P                       disk
  29 
  30 fru P;		/* declare disk (P) as a fru */
  31 asru P;		/* declare disk (P) as an asru */
  32 
  33 /*
  34  * Overall comments for this file:
  35  * <disk-as-detector> The disk-as-detector DE provides the mapping between
  36  * ereports generated by a kernel disk driver sd(7D) and resulting faults.
  37  */
  38 
  39 /*
  40  * SERD engine for media error fault propagation:
  41  *
  42  * This strategy gives a file system, such as ZFS, the chance to attempt
  43  * data recovery/relocation without the disk being faulted.  It depends on
  44  * the file system retrying the same LBA, so that a fault is triggered only
  45  * when recovery/relocation is not possible.
  46  *
  47  * The engine propagates at most one error per minute; if 2 or more errors
  48  * are still seen for the same LBA within 24 hours (N=1, T=24h), there is
  49  * a fault.
  50  */
  51 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
  52 
  53 /*
  54  * disk-as-detector: fault events (merr is tied to the SERD engine above).
  55  */
  56 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
  57 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
  58     engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
  59 
  60 /*
  61  * The uderr fault will be defined at some future time.
  62  * event fault.io.scsi.cmd.disk.dev.uderr@P;
  63  */
  64 
  65 /*
  66  * disk-as-detector: upset events, in the same order as the ereports.
  67  * NOTE: for now an upset is how we implement discard.
  68  */
  69 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
  70 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
  71 event upset.io.scsi.cmd.disk.dev.serr@P;
  72 event upset.io.scsi.cmd.disk.dev.uderr@P;
  73 event upset.io.scsi.cmd.disk.recovered@P;
  74 event upset.io.scsi.cmd.disk.tran@P;
  75 
  76 /*
  77  * disk-as-detector: ereports from the kernel.
  78  *
  79  * We don't know the topology for all SCSI disks, but the kernel always
  80  * generates ereport telemetry as if we did.  Declaring these ereports with
  81  * 'discard_if_config_unknown=1' lets ereports against things with unknown
  82  * topology be silently discarded.  The ereport data is logged in either
  83  * case, and can be viewed via 'fmdump -eV'.
  84  */
  85 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
  86 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
  87 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
  88 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
  89 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
  90 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
  91 
  92 /*
  93  * For some ereports the 'driver-assessment' value carried in the ereport
  94  * payload decides fault vs. upset; these macros are the prop constraints.
  95  */
  96 #define DRIVER_ASSESSMENT_FATAL         \
  97             (payloadprop_contains("driver-assessment", "fatal"))
  98 #define DRIVER_ASSESSMENT_NONFATAL      (!DRIVER_ASSESSMENT_FATAL)
  99 
 100 /*
 101  * disk-as-detector: propagations from faults (DRIVER_ASSESSMENT_FATAL only).
 102  * Fault details are carried as additional fault payload properties.
 103  * The properties we may need are listed below; note that the propagations
 104  * in this file copy key/asc/ascq (and lba for merr) but not op_code:
 105  * fault.io.scsi.cmd.disk.dev.rqs.derr
 106  *     op_code, key, asc, ascq
 107  * fault.io.scsi.cmd.disk.dev.rqs.merr
 108  *     op_code, key, asc, ascq, lba
 109  */
 110 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
 111     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
 112     setpayloadprop("key", payloadprop("key")) &&
 113     setpayloadprop("asc", payloadprop("asc")) &&
 114     setpayloadprop("ascq", payloadprop("ascq"))};
 115 
 116 /*
 117  * setserdsuffix() is given the LBA from the ereport payload, so the SERD
 118  * engine only triggers if the error recurs on the same LBA.
 119  */
 120 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
 121     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
 122     setserdsuffix(payloadprop("lba")) &&
 123     setpayloadprop("key", payloadprop("key")) &&
 124     setpayloadprop("asc", payloadprop("asc")) &&
 125     setpayloadprop("ascq", payloadprop("ascq")) &&
 126     setpayloadprop("lba", payloadprop("lba"))};
 127 
 128 /*
 129  * NOTE: this propagation uses the "may" propagation of eversholt.
 130  * The dummy ereport need never exist.  It is only a way of making the
 131  * diagnosis wait for the within() time on that ereport (60s) to
 132  * complete.  Once it has completed, the diagnosis continues even
 133  * though the dummy ereport did not occur.
 134  */
 135 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
 136 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
 137         ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
 138 
 139 /*
 140  * The uderr fault will be propagated at some future time.
 141  * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 142  *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 143  */
 144 
 145 /*
 146  * disk-as-detector: propagations from upsets, constrained by
 147  * DRIVER_ASSESSMENT_NONFATAL.
 148  */
 149 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
 150     ereport.io.scsi.cmd.disk.dev.rqs.derr@P { DRIVER_ASSESSMENT_NONFATAL };
 151 
 152 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
 153     ereport.io.scsi.cmd.disk.dev.rqs.merr@P { DRIVER_ASSESSMENT_NONFATAL };
 154 
 155 /*
 156  * disk-as-detector: propagations from upsets that carry no
 157  * driver-assessment constraint.
 158  */
 159 
 160 prop upset.io.scsi.cmd.disk.dev.serr@P ->
 161     ereport.io.scsi.cmd.disk.dev.serr@P;
 162 
 163 prop upset.io.scsi.cmd.disk.dev.uderr@P ->
 164     ereport.io.scsi.cmd.disk.dev.uderr@P;
 165 
 166 prop upset.io.scsi.cmd.disk.recovered@P ->
 167     ereport.io.scsi.cmd.disk.recovered@P;
 168 
 169 prop upset.io.scsi.cmd.disk.tran@P ->
 170     ereport.io.scsi.cmd.disk.tran@P;
 171 
 172 /*
 173  * --------------------------------------
 174  * The remainder of this file contains rules associated with the operation of
 175  * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 176  * 
 177  * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 178  * generated by the disk-transport fmd module, and the resulting faults.
 179  */
 180 
 181 /*
 182  * Fault events (ssm-wearout declares no FITrate/FRU/ASRU properties).
 183  */
 184 event fault.io.disk.over-temperature@P,
 185     FITrate=10, FRU=P, ASRU=P;
 186 event fault.io.disk.predictive-failure@P,
 187     FITrate=10, FRU=P, ASRU=P;
 188 event fault.io.disk.self-test-failure@P,
 189     FITrate=10, FRU=P, ASRU=P;
 190 event fault.io.disk.ssm-wearout@P;
 191 
 192 /*
 193  * ereports, one for each fault event declared above.
 194  */
 195 event ereport.io.scsi.disk.over-temperature@P;
 196 event ereport.io.scsi.disk.predictive-failure@P;
 197 event ereport.io.scsi.disk.self-test-failure@P;
 198 event ereport.io.scsi.disk.ssm-wearout@P;
 199 
 200 /*
 201  * Propagations; two of them also copy ereport payload into the fault.
 202  */
 203 prop fault.io.disk.over-temperature@P ->
 204     ereport.io.scsi.disk.over-temperature@P;
 205 
 206 prop fault.io.disk.self-test-failure@P ->
 207     ereport.io.scsi.disk.self-test-failure@P;
 208 
 209 prop fault.io.disk.predictive-failure@P ->
 210     ereport.io.scsi.disk.predictive-failure@P {
 211     setpayloadprop("asc", payloadprop("additional-sense-code")) &&
 212     setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
 213 
 214 prop fault.io.disk.ssm-wearout@P ->
 215     ereport.io.scsi.disk.ssm-wearout@P {
 216     setpayloadprop("current-wearout-percentage",
 217         payloadprop("current-ssm-wearout")) &&
 218     setpayloadprop("threshold-wearout-percentage",
 219         payloadprop("threshold-ssm-wearout")) };