/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#pragma dictionary "DISK"

#define P disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute (see the
 * dummy ereport declared with within(60s) further down) and then if we
 * still get 2 or more errors within 24 hours for the same LBA, there is a
 * fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 * An ereport is treated as fatal when its "driver-assessment" payload member
 * contains "fatal"; everything else is treated as non-fatal.
 */
#define DRIVER_ASSESSMENT_FATAL \
    (payloadprop_contains("driver-assessment", "fatal"))
#define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payload members we may need are listed below:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *	op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *	op_code, key, asc, ascq, lba
 *
 * NOTE: the propagations below copy key, asc and ascq (plus lba for merr)
 * from the ereport payload into the fault; op_code is not set here.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Utilize setserdsuffix with the specific LBA, so that
 * the serd engine only triggers if the fault recurs on the same LBA.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    setserdsuffix(payloadprop("lba")) &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq")) &&
    setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment)
 */

prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 * All three faults carry the same properties: FITrate=10, with the disk
 * itself named as both FRU and ASRU.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations.
 * The predictive-failure fault additionally copies the ereport's
 * "additional-sense-code" and "additional-sense-code-qualifier" payload
 * members into the fault as "asc" and "ascq".
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
    setpayloadprop("asc", payloadprop("additional-sense-code")) &&
    setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };