/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#pragma dictionary "DISK"

/*
 * Every event and propagation in this file is declared against the
 * same path, abbreviated as P.
 */
#define	P	disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute, and
 * then if we still get 2 or more errors within 24 hours for the same LBA,
 * there is a fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 *
 * The merr fault is tied to the SERD engine declared above; the derr
 * fault has no engine attached.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 *
 * DRIVER_ASSESSMENT_NONFATAL is defined as the exact negation of
 * DRIVER_ASSESSMENT_FATAL, so for a given ereport exactly one of the two
 * constraints holds.
 */
#define	DRIVER_ASSESSMENT_FATAL \
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are listed below:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *	op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *	op_code, key, asc, ascq, lba
 *
 * NOTE(review): op_code is listed above but neither propagation below
 * copies it into the fault payload; only key, asc, ascq (and lba for
 * merr) are set.  Confirm whether op_code is still wanted.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Utilize setserdsuffix with the specific LBA, so that the serd engine
 * only triggers if the fault recurs on the same LBA.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    setserdsuffix(payloadprop("lba")) &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq")) &&
    setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment).
 */

prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 *
 * Each property is listed exactly once per declaration.
 *
 * NOTE(review): ssm-wearout declares no FITrate/FRU/ASRU properties,
 * unlike the three faults above it.  Left as is; confirm whether that
 * difference is intentional.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.ssm-wearout@P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;
event ereport.io.scsi.disk.ssm-wearout@P;

/*
 * Propagations.
 *
 * over-temperature and self-test-failure map straight from ereport to
 * fault.  predictive-failure and ssm-wearout additionally copy selected
 * ereport payload members into the fault payload under different names.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
    setpayloadprop("asc", payloadprop("additional-sense-code")) &&
    setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };

prop fault.io.disk.ssm-wearout@P ->
    ereport.io.scsi.disk.ssm-wearout@P {
    setpayloadprop("current-wearout-percentage",
    payloadprop("current-ssm-wearout")) &&
    setpayloadprop("threshold-wearout-percentage",
    payloadprop("threshold-ssm-wearout")) };